Introduction
export HS_TYPE=gene
export HS_TAG=ID
Testing
directory
I think for speed sake, I will run these through salmon.
I have been making some changes to my pipeline recently; so I will
put 100k reads into a test directory.
cd preprocessing
mkdir test
cd test
less ../SRR/*R1* | head -n 400000 > r1.fastq
less ../SRR/*R2* | head -n 400000 > r2.fastq
gzip *.fastq
inputs=$(/bin/ls outputs/12fastp/*-fastp.fastq.xz | tr '\n' ':')
cyoa --method salmon --species hg38_111 --input $inputs --libtype CDS --jprefix 20
Trimming via
fastp
I am going to use that tree to run everything first, notably I want
my tools to collect more thorough statistics on runtime etc to improve
my nascent heuristics to choose memory/time on the cluster.
start=$(pwd)
for i in $(/bin/ls -d SR*); do
cd ${start}/${i}
mkdir unprocessed
mv *.fastq.gz unprocessed/
cyoa --method fastp --input $(/bin/ls -d unprocessed/*.fastq.gz | tr '\n' ':')
done
cd $start
Salmon
quantification
start=$(pwd)
for i in $(/bin/ls -d SR*); do
cd ${start}/${i}
inputs=$(/bin/ls outputs/12fastp/*-fastp.fastq.xz | tr '\n' ':')
cyoa --method salmon --species hg38_111 --input $inputs --libtype CDS --jprefix 20
done
cd $start
Load annotations
hs_annot <- load_biomart_annotations(year = 2021, month = 02)
## The biomart annotations file already exists, loading from it.
annot <- hs_annot[["gene_annotations"]]
Create SE
tx_gene_map <- hs_annot[["gene_tx_map"]]
tx_gene_map[["transcript"]] <- gsub(x = tx_gene_map[["transcript"]], pattern = "\\.\\d+$", replacement = "")
rownames(tx_gene_map) <- make.names(gsub(x = rownames(tx_gene_map), pattern = "\\.\\d+$", replacement = ""), unique = TRUE)
## Error in `rownames<-`(`*tmp*`, value = character(0)): attempt to set 'rownames' on an object with no dimensions
hs_se_tx <- create_se(meta[["new_meta"]], gene_info = annot, file_column = "salmon_count_table")
## Reading the sample metadata.
## Checking the state of the condition column.
## Checking the state of the batch column.
## Checking the condition factor.
## The sample definitions comprises: 41 rows(samples) and 21 columns(metadata fields).
## Warning in create_se(meta[["new_meta"]], gene_info = annot, file_column = "salmon_count_table"): Some
## samples were removed when cross referencing the samples against the count data.
## Warning in create_se(meta[["new_meta"]], gene_info = annot, file_column = "salmon_count_table"): Even
## after changing the rownames in gene info, they do not match the count table.
## Even after changing the rownames in gene info, they do not match the count table.
## Here are the first few rownames from the count tables:
## ENST00000000233.10, ENST00000000412.8, ENST00000000442.11, ENST00000001008.6, ENST00000001146.7, ENST00000002125.9
## Here are the first few rownames from the gene information table:
## ENSG00000000003, ENSG00000000005, ENSG00000000419, ENSG00000000457, ENSG00000000460, ENSG00000000938
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the summarized experiment to 'se.rda'.
## The final summarized experiment has 97117 rows and 21 columns.
hs_se_gene <- create_se(meta[["new_meta"]], gene_info = annot, file_column = "salmon_count_table",
tx_gene_map = tx_gene_map)
## Reading the sample metadata.
## Checking the state of the condition column.
## Checking the state of the batch column.
## Checking the condition factor.
## The sample definitions comprises: 41 rows(samples) and 21 columns(metadata fields).
## In some cases, (notably salmon) the format of the IDs used by this can be tricky.
## It is likely to require the transcript ID followed by a '.' and the ensembl column:
## 'transcript_version', which is explicitly different than the gene version column.
## If this is not correctly performed, very few genes will be observed
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## Error in `colnames<-`(`*tmp*`, value = c("tx", "gene")): attempt to set 'colnames' on an object with less than two dimensions
Set
condition/batch
I have extracted two potentially interesting columns from the
metadata, after that I will need to read more carefully in the paper to
try to get a sense of what is what…
hs_se <- set_se_conditions(hs_se_gene, fact = "donor") %>%
set_se_batches(fact = "type")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'colData': error in evaluating the argument 'x' in selecting a method for function 'colData': object 'hs_se_gene' not found
A couple plots
## Error: object 'hs_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_libsize': object 'hs_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_nonzero': object 'hs_se' not found
Presumably we will need to separate the exome and rnaseq data; but
for the moment I will leave them together to see what I can see.
hs_norm <- normalize(hs_se, transform = "log2", convert = "cpm", filter = TRUE, norm = "quant")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input' in selecting a method for function 'normalize': object 'hs_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hs_norm' not found
hs_nb <- normalize(hs_se, transform = "log2", convert = "cpm", filter = TRUE, batch = "combat")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input' in selecting a method for function 'normalize': object 'hs_se' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hs_nb' not found
Ok, nevermind, the exome data is too different.
hs_rna <- subset_se(hs_se, subset = "batch=='rnaseq'")
## Error: object 'hs_se' not found
hs_rna_norm <- normalize(hs_rna, transform = "log2", convert = "cpm", filter = TRUE, norm = "quant")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input' in selecting a method for function 'normalize': object 'hs_rna' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hs_rna_norm' not found
hs_rna_nb <- normalize(hs_rna, transform = "log2", convert = "cpm", filter = TRUE, batch = "svaseq")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'input' in selecting a method for function 'normalize': object 'hs_rna' not found
## Error in h(simpleError(msg, call)): error in evaluating the argument 'data' in selecting a method for function 'plot_pca': object 'hs_rna_nb' not found
LS0tCnRpdGxlOiAiUHJlcHJvY2Vzc2luZyBzb21lIGh1bWFuIHNhbXBsZXMuIgphdXRob3I6ICJhdGIgYWJlbGV3QGdtYWlsLmNvbSIKZGF0ZTogImByIFN5cy5EYXRlKClgIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB6ZW5idXJuCiAgICBrZWVwX21kOiBmYWxzZQogICAgbW9kZTogc2VsZmNvbnRhaW5lZAogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBzZWxmX2NvbnRhaW5lZDogdHJ1ZQogICAgdGhlbWU6IHJlYWRhYmxlCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDoKICAgICAgY29sbGFwc2VkOiBmYWxzZQogICAgICBzbW9vdGhfc2Nyb2xsOiBmYWxzZQogIHJtZGZvcm1hdHM6OnJlYWR0aGVkb3duOgogICAgY29kZV9kb3dubG9hZDogdHJ1ZQogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICBkZl9wcmludDogcGFnZWQKICAgIGZpZ19jYXB0aW9uOiB0cnVlCiAgICBmaWdfaGVpZ2h0OiA3CiAgICBmaWdfd2lkdGg6IDcKICAgIGhpZ2hsaWdodDogemVuYnVybgogICAgd2lkdGg6IDMwMAogICAga2VlcF9tZDogZmFsc2UKICAgIG1vZGU6IHNlbGZjb250YWluZWQKICAgIHRvY19mbG9hdDogdHJ1ZQogIEJpb2NTdHlsZTo6aHRtbF9kb2N1bWVudDoKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgIGNvZGVfZm9sZGluZzogc2hvdwogICAgZmlnX2NhcHRpb246IHRydWUKICAgIGZpZ19oZWlnaHQ6IDcKICAgIGZpZ193aWR0aDogNwogICAgaGlnaGxpZ2h0OiB6ZW5idXJuCiAgICBrZWVwX21kOiBmYWxzZQogICAgbW9kZTogc2VsZmNvbnRhaW5lZAogICAgdG9jX2Zsb2F0OiB0cnVlCi0tLQoKPHN0eWxlIHR5cGU9InRleHQvY3NzIj4KYm9keSwgdGQgewogIGZvbnQtc2l6ZTogMTZweDsKfQpjb2RlLnJ7CiAgZm9udC1zaXplOiAxNnB4Owp9CnByZSB7CiAgZm9udC1zaXplOiAxNnB4Cn0KYm9keSAubWFpbi1jb250YWluZXIgewogICBtYXgtd2lkdGg6IDE2MDBweDsKfQo8L3N0eWxlPgoKYGBge3Igb3B0aW9ucywgaW5jbHVkZT1GQUxTRX0KbGlicmFyeShyZXRpY3VsYXRlKQp0dCA8LSB0cnkoZGV2dG9vbHM6OmxvYWRfYWxsKCJ+L2hwZ2x0b29scyIpKQprbml0cjo6b3B0c19rbml0JHNldCgKICBwcm9ncmVzcyA9IFRSVUUsIHZlcmJvc2UgPSBUUlVFLCB3aWR0aCA9IDkwLCBlY2hvID0gVFJVRSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KAogIGVycm9yID0gVFJVRSwgZmlnLndpZHRoID0gOCwgZmlnLmhlaWdodCA9IDgsIGZpZy5yZXRpbmEgPSAyLAogIG91dC53aWR0aCA9ICIxMDAlIiwgZGV2ID0gInBuZyIsCiAgZGV2LmFyZ3MgPSBsaXN0KHBuZyA9IGxpc3QodHlwZSA9ICJjYWlyby1wbmciKSkpCm9sZF9vcHRpb25zIDwtIG9wdGlvbnMoZGlnaXRzID0gNCwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFLCBrbml0ci5kdXBsaWNhdGUubGFiZWwgPSAiYWxsb3ciKQpnZ3Bsb3QyOjp0aGVtZV9zZXQoZ2dwbG90Mjo6dGhlbWVfYncoYmFzZV9zaXplID0gMTIpKQp2ZXIgPC0gIjIwMjMwNSIKcHJldmlvdXNfZmlsZSA8LSAiIgp2ZXIgPC0gZm9ybWF0KFN5cy5EYXRlKCksICIlWSVtJWQiKQoKIyN0bXAgPC0gc20obG9hZG1lKGZpbGVuYW1lPXBhc3RlMChnc3ViKHBhdHRlcm49IlxcLlJtZCIsIHJlcGxhY2U9IiIsIHg9cHJldmlvdXNfZmlsZSksICItdiIsIHZlciwgIi5yZGEueHoiKSkpCnJtZF9maWxlIDwtICJwcmVwcm9jZXNzLlJtZCIKYGBgCgojIEludHJvZHVjdGlvbgoKYGBge2Jhc2gsIGV2YWw9RkFMU0V9CmV4cG9ydCBIU19UWVBFPWdlbmUKZXhwb3J0IEhTX1RBRz1JRApgYGAKCiMjIFRlc3RpbmcgZGlyZWN0b3J5CgpJIHRoaW5rIGZvciBzcGVlZCBzYWtlLCBJIHdpbGwgcnVuIHRoZXNlIHRocm91Z2ggc2FsbW9uLgoKSSBoYXZlIGJlZW4gbWFraW5nIHNvbWUgY2hhbmdlcyB0byBteSBwaXBlbGluZSByZWNlbnRseTsgc28gSSB3aWxsIHB1dAoxMDBrIHJlYWRzIGludG8gYSB0ZXN0IGRpcmVjdG9yeS4KCmBgYHtiYXNoLCBldmFsPUZBTFNFfQpjZCBwcmVwcm9jZXNzaW5nCm1rZGlyIHRlc3QKY2QgdGVzdApsZXNzIC4uL1NSUi8qUjEqIHwgaGVhZCAtbiA0MDAwMDAgPiByMS5mYXN0cQpsZXNzIC4uL1NSUi8qUjIqIHwgaGVhZCAtbiA0MDAwMDAgPiByMi5mYXN0cQpnemlwICouZmFzdHEKaW5wdXRzPSQoL2Jpbi9scyBvdXRwdXRzLzEyZmFzdHAvKi1mYXN0cC5mYXN0cS54eiB8IHRyICdcbicgJzonKQpjeW9hIC0tbWV0aG9kIHNhbG1vbiAtLXNwZWNpZXMgaGczOF8xMTEgLS1pbnB1dCAkaW5wdXRzIC0tbGlidHlwZSBDRFMgLS1qcHJlZml4IDIwCmBgYAoKIyMgVHJpbW1pbmcgdmlhIGZhc3RwCgpJIGFtIGdvaW5nIHRvIHVzZSB0aGF0IHRyZWUgdG8gcnVuIGV2ZXJ5dGhpbmcgZmlyc3QsIG5vdGFibHkgSSB3YW50IG15CnRvb2xzIHRvIGNvbGxlY3QgbW9yZSB0aG9yb3VnaCBzdGF0aXN0aWNzIG9uIHJ1bnRpbWUgZXRjIHRvIGltcHJvdmUgbXkKbmFzY2VudCBoZXVyaXN0aWNzIHRvIGNob29zZSBtZW1vcnkvdGltZSBvbiB0aGUgY2x1c3Rlci4KCmBgYHtiYXNoLCBldmFsPUZBTFNFfQpzdGFydD0kKHB3ZCkKZm9yIGkgaW4gJCgvYmluL2xzIC1kIFNSKik7IGRvCiAgICBjZCAke3N0YXJ0fS8ke2l9CiAgICBta2RpciB1bnByb2Nlc3NlZAogICAgbXYgKi5mYXN0cS5neiB1bnByb2Nlc3NlZC8KICAgIGN5b2EgLS1tZXRob2QgZmFzdHAgLS1pbnB1dCAkKC9iaW4vbHMgLWQgdW5wcm9jZXNzZWQvKi5mYXN0cS5neiB8IHRyICdcbicgJzonKQpkb25lCmNkICRzdGFydApgYGAKCiMjIFNhbG1vbiBxdWFudGlmaWNhdGlvbgoKYGBge2Jhc2gsIGV2YWw9RkFMU0V9CnN0YXJ0PSQocHdkKQpmb3IgaSBpbiAkKC9iaW4vbHMgLWQgU1IqKTsgZG8KICAgIGNkICR7c3RhcnR9LyR7aX0KICAgIGlucHV0cz0kKC9iaW4vbHMgb3V0cHV0cy8xMmZhc3RwLyotZmFzdHAuZmFzdHEueHogfCB0ciAnXG4nICc6JykKICAgIGN5b2EgLS1tZXRob2Qgc2FsbW9uIC0tc3BlY2llcyBoZzM4XzExMSAtLWlucHV0ICRpbnB1dHMgLS1saWJ0eXBlIENEUyAtLWpwcmVmaXggMjAKZG9uZQpjZCAkc3RhcnQKYGBgCgojIExvYWQgYW5ub3RhdGlvbnMKCmBgYHtyfQpoc19hbm5vdCA8LSBsb2FkX2Jpb21hcnRfYW5ub3RhdGlvbnMoeWVhciA9IDIwMjEsIG1vbnRoID0gMDIpCmFubm90IDwtIGhzX2Fubm90W1siZ2VuZV9hbm5vdGF0aW9ucyJdXQpgYGAKCiMgQ29sbGVjdCBwcmVwcm9jZXNzaW5nIGluZm9ybWF0aW9uCgpgYGB7cn0KbWV0YSA8LSBnYXRoZXJfcHJlcHJvY2Vzc2luZ19tZXRhZGF0YSgic2FtcGxlX3NoZWV0cy9QUkpOQTY3NTA5MC54bHN4IikKYGBgCgojIENyZWF0ZSBTRQoKYGBge3J9CnR4X2dlbmVfbWFwIDwtIGhzX2Fubm90W1siZ2VuZV90eF9tYXAiXV0KdHhfZ2VuZV9tYXBbWyJ0cmFuc2NyaXB0Il1dIDwtIGdzdWIoeCA9IHR4X2dlbmVfbWFwW1sidHJhbnNjcmlwdCJdXSwgcGF0dGVybiA9ICJcXC5cXGQrJCIsIHJlcGxhY2VtZW50ID0gIiIpCnJvd25hbWVzKHR4X2dlbmVfbWFwKSA8LSBtYWtlLm5hbWVzKGdzdWIoeCA9IHJvd25hbWVzKHR4X2dlbmVfbWFwKSwgcGF0dGVybiA9ICJcXC5cXGQrJCIsIHJlcGxhY2VtZW50ID0gIiIpLCB1bmlxdWUgPSBUUlVFKQpoc19zZV90eCA8LSBjcmVhdGVfc2UobWV0YVtbIm5ld19tZXRhIl1dLCBnZW5lX2luZm8gPSBhbm5vdCwgZmlsZV9jb2x1bW4gPSAic2FsbW9uX2NvdW50X3RhYmxlIikKaHNfc2VfZ2VuZSA8LSBjcmVhdGVfc2UobWV0YVtbIm5ld19tZXRhIl1dLCBnZW5lX2luZm8gPSBhbm5vdCwgZmlsZV9jb2x1bW4gPSAic2FsbW9uX2NvdW50X3RhYmxlIiwKICAgICAgICAgICAgICAgICAgICAgICAgdHhfZ2VuZV9tYXAgPSB0eF9nZW5lX21hcCkKCmBgYAoKIyBTZXQgY29uZGl0aW9uL2JhdGNoCgpJIGhhdmUgZXh0cmFjdGVkIHR3byBwb3RlbnRpYWxseSBpbnRlcmVzdGluZyBjb2x1bW5zIGZyb20gdGhlCm1ldGFkYXRhLCBhZnRlciB0aGF0IEkgd2lsbCBuZWVkIHRvIHJlYWQgbW9yZSBjYXJlZnVsbHkgaW4gdGhlIHBhcGVyCnRvIHRyeSB0byBnZXQgYSBzZW5zZSBvZiB3aGF0IGlzIHdoYXQuLi4KCmBgYHtyfQpoc19zZSA8LSBzZXRfc2VfY29uZGl0aW9ucyhoc19zZV9nZW5lLCBmYWN0ID0gImRvbm9yIikgJT4lCiAgc2V0X3NlX2JhdGNoZXMoZmFjdCA9ICJ0eXBlIikKCmBgYAoKIyBBIGNvdXBsZSBwbG90cwoKYGBge3J9CnBsb3RfbGVnZW5kKGhzX3NlKQpwbG90X2xpYnNpemUoaHNfc2UpCnBsb3Rfbm9uemVybyhoc19zZSkKYGBgCgpQcmVzdW1hYmx5IHdlIHdpbGwgbmVlZCB0byBzZXBhcmF0ZSB0aGUgZXhvbWUgYW5kIHJuYXNlcSBkYXRhOyBidXQgZm9yCnRoZSBtb21lbnQgSSB3aWxsIGxlYXZlIHRoZW0gdG9nZXRoZXIgdG8gc2VlIHdoYXQgSSBjYW4gc2VlLgoKYGBge3J9CmhzX25vcm0gPC0gbm9ybWFsaXplKGhzX3NlLCB0cmFuc2Zvcm0gPSAibG9nMiIsIGNvbnZlcnQgPSAiY3BtIiwgZmlsdGVyID0gVFJVRSwgbm9ybSA9ICJxdWFudCIpCnBsb3RfcGNhKGhzX25vcm0pCmhzX25iIDwtIG5vcm1hbGl6ZShoc19zZSwgIHRyYW5zZm9ybSA9ICJsb2cyIiwgY29udmVydCA9ICJjcG0iLCBmaWx0ZXIgPSBUUlVFLCBiYXRjaCA9ICJjb21iYXQiKQpwbG90X3BjYShoc19uYikKYGBgCgpPaywgbmV2ZXJtaW5kLCB0aGUgZXhvbWUgZGF0YSBpcyB0b28gZGlmZmVyZW50LgoKYGBge3J9CmhzX3JuYSA8LSBzdWJzZXRfc2UoaHNfc2UsIHN1YnNldCA9ICJiYXRjaD09J3JuYXNlcSciKQpoc19ybmFfbm9ybSA8LSBub3JtYWxpemUoaHNfcm5hLCB0cmFuc2Zvcm0gPSAibG9nMiIsIGNvbnZlcnQgPSAiY3BtIiwgZmlsdGVyID0gVFJVRSwgbm9ybSA9ICJxdWFudCIpCnBsb3RfcGNhKGhzX3JuYV9ub3JtKQpoc19ybmFfbmIgPC0gbm9ybWFsaXplKGhzX3JuYSwgdHJhbnNmb3JtID0gImxvZzIiLCBjb252ZXJ0ID0gImNwbSIsIGZpbHRlciA9IFRSVUUsIGJhdGNoID0gInN2YXNlcSIpCnBsb3RfcGNhKGhzX3JuYV9uYikKYGBgCg==