I will use this sheet for exporting data to collaborators. Currently, that is only the folks interested in doing some analyses with the first PLOS NTD paper.
It is worth noting that I recently found some errors in my accounting for these and the plosntd2 samples (I had the pathogen strains reversed).
## Successfully connected to the hsapiens_gene_ensembl database.
## Cache found
## Finished downloading ensembl gene annotations.
## Cache found
## Finished downloading ensembl structure annotations.
## Dropping haplotype chromosome annotations, set drop_haplotypes=FALSE if this is bad.
## Saving annotations to hsapiens_biomart_annotations.rda.
## Finished save().
rownames(hs_annot) <- make.names(
paste0(hs_annot[["ensembl_transcript_id"]], ".",
hs_annot[["transcript_version"]]),
unique=TRUE)
hs_tx_gene <- hs_annot[, c("ensembl_gene_id", "ensembl_transcript_id")]
hs_tx_gene[["id"]] <- rownames(hs_tx_gene)
hs_tx_gene <- hs_tx_gene[, c("id", "ensembl_gene_id")]
new_hs_annot <- hs_annot
rownames(new_hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)
Note that I need to overwrite the savefile of the annotations and not use the archive server, since 38v99 came out in 202001.
sample_sheet <- glue::glue("sample_sheets/UMD_leishmania_host_metasheet_{ver}.xlsx")
## As of 20200222, I have only performed hisat2 mapping for the plos ntd2 data.
prefix <- "excel/plos_ntd_host_hisat2-v"
plosntd1_expt <- create_expt(metadata=sample_sheet,
file_column="hg3891hisat2",
gene_info=new_hs_annot,
savefile=glue::glue("{prefix}{ver}.rda"))
## Reading the sample metadata.
## Dropped 2 rows from the sample metadata because they were blank.
## The sample definitions comprises: 441 rows(samples) and 68 columns(metadata fields).
## Reading count tables.
## Reading count tables with read.table().
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0725/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0726/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0727/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0728/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0729/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl0730/outputs/hisat2_hg38_91/forward.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1108/outputs/hisat2_hg38_91/E1.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1109/outputs/hisat2_hg38_91/E2.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1110/outputs/hisat2_hg38_91/E3.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1111/outputs/hisat2_hg38_91/E4.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1112/outputs/hisat2_hg38_91/E5.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1113/outputs/hisat2_hg38_91/E6.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1114/outputs/hisat2_hg38_91/E7.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1115/outputs/hisat2_hg38_91/E8.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1116/outputs/hisat2_hg38_91/L1.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1117/outputs/hisat2_hg38_91/L2.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1118/outputs/hisat2_hg38_91/L3.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1119/outputs/hisat2_hg38_91/L4.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1120/outputs/hisat2_hg38_91/L5.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1121/outputs/hisat2_hg38_91/L6.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1122/outputs/hisat2_hg38_91/L7.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1123/outputs/hisat2_hg38_91/L8.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1124/outputs/hisat2_hg38_91/L9.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1125/outputs/hisat2_hg38_91/L10.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1126/outputs/hisat2_hg38_91/L11.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1127/outputs/hisat2_hg38_91/L12.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1128/outputs/hisat2_hg38_91/L13.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1129/outputs/hisat2_hg38_91/L14.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1130/outputs/hisat2_hg38_91/L15.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1131/outputs/hisat2_hg38_91/L16.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1132/outputs/hisat2_hg38_91/L17.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1133/outputs/hisat2_hg38_91/N1.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1134/outputs/hisat2_hg38_91/N2.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1135/outputs/hisat2_hg38_91/N3.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1136/outputs/hisat2_hg38_91/N4.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1137/outputs/hisat2_hg38_91/N5.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1138/outputs/hisat2_hg38_91/N6.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1139/outputs/hisat2_hg38_91/N7.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1140/outputs/hisat2_hg38_91/N8.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1141/outputs/hisat2_hg38_91/N9.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1142/outputs/hisat2_hg38_91/N10.count.xz contains 58307 rows and merges to 58307 rows.
## Finished reading count tables.
## Matched 57755 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 58302 rows and 41 columns.
plosntd1_expt <- set_expt_conditions(plosntd1_expt, fact="infectionstatus")
written_csv <- readr::write_csv(x=as.data.frame(exprs(plosntd1_expt)),
path=glue::glue("{prefix}{ver}.csv"))
written_xls <- write_expt(plosntd1_expt,
excel=glue::glue("{prefix}{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Attempting mixed linear model with: ~ (1|condition) + (1|batch)
## Fitting the expressionset to the model, this is slow.
## Dividing work into 100 chunks...
##
## Total:184 s
## Placing factor: condition at the beginning of the model.
## Writing the normalized reads.
## Graphing the normalized reads.
## Attempting mixed linear model with: ~ (1|condition) + (1|batch)
## Fitting the expressionset to the model, this is slow.
## Dividing work into 100 chunks...
##
## Total:154 s
## Placing factor: condition at the beginning of the model.
## Writing the median reads by factor.
salmon_prefix <- "excel/plos_ntd_host_salmon-v"
salmon_plosntd1_expt <- create_expt(sample_sheet,
file_column="hg3899salmon",
gene_info=new_hs_annot,
tx_gene_map=hs_tx_gene,
savefile=glue::glue("{salmon_prefix}{ver}.rda"))
## Reading the sample metadata.
## Dropped 2 rows from the sample metadata because they were blank.
## The sample definitions comprises: 441 rows(samples) and 68 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 19999 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 19999 rows and 41 columns.
salmon_plosntd1_expt <- set_expt_conditions(salmon_plosntd1_expt, fact="infectionstatus")
save_result <- try(save(salmon_plosntd1_expt, file=paste0(salmon_prefix, ver, ".rda")))
salmon_written_csv <- readr::write_csv(x=as.data.frame(exprs(salmon_plosntd1_expt)),
path=glue::glue("{salmon_prefix}{ver}.csv"))
salmon_written_xls <- write_expt(salmon_plosntd1_expt,
excel=glue::glue("{salmon_prefix}{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure
## Attempting mixed linear model with: ~ (1|condition) + (1|batch)
## Fitting the expressionset to the model, this is slow.
## Dividing work into 100 chunks...
##
## Total:96 s
## Placing factor: condition at the beginning of the model.
## Writing the normalized reads.
## Graphing the normalized reads.
## Attempting mixed linear model with: ~ (1|condition) + (1|batch)
## Fitting the expressionset to the model, this is slow.
## Dividing work into 100 chunks...
##
## Total:118 s
## Placing factor: condition at the beginning of the model.
## Writing the median reads by factor.
## [1] "expt" "fun" "hs_annot"
## [4] "hs_tx_gene" "new_hs_annot" "old_options"
## [7] "plosntd1_expt" "prefix" "previous_file"
## [10] "rundate" "salmon_plosntd1_expt" "salmon_prefix"
## [13] "salmon_written_csv" "salmon_written_xls" "sample_sheet"
## [16] "save_result" "tt" "ver"
## [19] "written_csv" "written_xls"
## Length Class Mode
## title 1 -none- character
## notes 1 -none- character
## initial_metadata 69 data.frame list
## expressionset 1 ExpressionSet S4
## design 69 data.frame list
## conditions 41 factor numeric
## batches 41 factor numeric
## samplenames 41 -none- character
## colors 41 -none- character
## state 5 -none- list
## libsize 41 -none- numeric
## hpgl0725 hpgl0726 hpgl0727 hpgl0728 hpgl0729 hpgl0730 hpgl1108
## ENSG00000000003 150 338 387 681 787 458 1017
## hpgl1109 hpgl1110 hpgl1111 hpgl1112 hpgl1113 hpgl1114 hpgl1115
## ENSG00000000003 742 742 1585 871 1697 835 383
## hpgl1116 hpgl1117 hpgl1118 hpgl1119 hpgl1120 hpgl1121 hpgl1122
## ENSG00000000003 652 934 445 428 424 592 192
## hpgl1123 hpgl1124 hpgl1125 hpgl1126 hpgl1127 hpgl1128 hpgl1129
## ENSG00000000003 117 439 1009 664 971 889 396
## hpgl1130 hpgl1131 hpgl1132 hpgl1133 hpgl1134 hpgl1135 hpgl1136
## ENSG00000000003 653 408 445 4579 3436 954 1021
## hpgl1137 hpgl1138 hpgl1139 hpgl1140 hpgl1141 hpgl1142
## ENSG00000000003 1126 1126 1120 687 900 1508
## sampleid study lab host hoststrain hostcelltype
## hpgl0725 hpgl0725 plosntd2 mosser homo_sapiens <NA> skin
## hostcellsource infectstate differentiationmethod stimulation
## hpgl0725 <NA> yes <NA> <NA>
## pathogenspecies pathogenstrain pathogenstage expttime moi
## hpgl0725 lamazonensis <NA> amastigote undefined NA
## numberparasitecells numberhostcells infectstateold donor celltype
## hpgl0725 NA NA infected <NA> skin
## state studybatch skipped host1 pathogenspecies1
## hpgl0725 la_infected <NA> no homo_sapiens lamazonensis
## hg3891salmon
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_hg38_91/quant.sf
## hg3899salmon
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_hg38_99/quant.sf
## hg3891hisat2
## hpgl0725 preprocessing/hpgl0725/outputs/hisat2_hg38_91/forward.count.xz
## mmusculusfile lmajorfile
## hpgl0725 <NA> <NA>
## lmexicanafile
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_lmexicana_v36/quant.sf
## lpanamensisfile lbraziliensisfile
## hpgl0725 <NA> <NA>
## lamazonensisfile
## hpgl0725 preprocessing/hpgl0725/outputs/salmon_lamazonensis_v44/quant.sf
## samplename infectionstatus samplealias parasitedetectionstatus
## hpgl0725 <NA> diffuse <NA> <NA>
## lesionsizemm illnessdurationdays patientage patientsex experimentalias
## hpgl0725 NA 22y 50 F <NA>
## tubelabel tubealias expperson exptdate cellsperwell infectionperiod
## hpgl0725 <NA> <NA> <NA> NA NA <NA>
## media parasiteenrichment parasitesperinfectedcell parasitesper100cells
## hpgl0725 <NA> <NA> <NA> NA
## percentinfectedcells parasitecellrange rnangul libraryconstruction
## hpgl0725 <NA> <NA> NA <NA>
## sraaccession condition batch notes totalreads trimmedreads
## hpgl0725 SRR7275003 skin plosntd2 <NA> NA 60200944
## mappedhost mappedparasite host2 percentmappedhost
## hpgl0725 28558522 7742236 homo_sapiens 0.4744
## percentmappedparasite file
## hpgl0725 0.1286 null
## ensembl_transcript_id ensembl_gene_id version
## ENSG00000000003 ENST00000373020 ENSG00000000003 15
## transcript_version hgnc_symbol
## ENSG00000000003 9 TSPAN6
## description
## ENSG00000000003 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## gene_biotype cds_length chromosome_name strand start_position
## ENSG00000000003 protein_coding 738 X - 100627108
## end_position
## ENSG00000000003 100639991