I will use this sheet for exporting data to collaborators. Currently, that is only the folks interested in doing some analyses with the first PLOS NTD paper.
It is worth noting that I recently found some errors in my accounting for these and the plosntd2 samples (I had the pathogen strains reversed).
## The biomart annotations file already exists, loading from it.
rownames(hs_annot) <- make.names(
paste0(hs_annot[["ensembl_transcript_id"]], ".",
hs_annot[["transcript_version"]]),
unique=TRUE)
hs_tx_gene <- hs_annot[, c("ensembl_gene_id", "ensembl_transcript_id")]
hs_tx_gene[["id"]] <- rownames(hs_tx_gene)
hs_tx_gene <- hs_tx_gene[, c("id", "ensembl_gene_id")]
new_hs_annot <- hs_annot
rownames(new_hs_annot) <- make.names(hs_annot[["ensembl_gene_id"]], unique=TRUE)
sample_sheet <- "sample_sheets/UMD_leishmania_host_metasheet_20200227.xlsx"
## As of 20200222, I have only performed hisat2 mapping for the plos ntd2 data.
prefix <- "excel/plos_ntd_host_hisat2-v"
plosntd1_expt <- create_expt(sample_sheet,
file_column="hsapienshisatfile",
gene_info=new_hs_annot,
savefile=glue::glue("{prefix}{ver}.rda"))
## Reading the sample metadata.
## The sample definitions comprises: 441 rows(samples) and 68 columns(metadata fields).
## Reading count tables.
## Reading count tables with read.table().
## /mnt/sshfs/cbcbsub/fs/cbcb-lab/nelsayed/scratch/atb/rnaseq/multiple_leishmania_2018/preprocessing/hpgl1108/outputs/hisat2_hg38_91/E1.count.xz contains 58307 rows.
## preprocessing/hpgl1109/outputs/hisat2_hg38_91/E2.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1110/outputs/hisat2_hg38_91/E3.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1111/outputs/hisat2_hg38_91/E4.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1112/outputs/hisat2_hg38_91/E5.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1113/outputs/hisat2_hg38_91/E6.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1114/outputs/hisat2_hg38_91/E7.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1115/outputs/hisat2_hg38_91/E8.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1116/outputs/hisat2_hg38_91/L1.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1117/outputs/hisat2_hg38_91/L2.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1118/outputs/hisat2_hg38_91/L3.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1119/outputs/hisat2_hg38_91/L4.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1120/outputs/hisat2_hg38_91/L5.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1121/outputs/hisat2_hg38_91/L6.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1122/outputs/hisat2_hg38_91/L7.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1123/outputs/hisat2_hg38_91/L8.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1124/outputs/hisat2_hg38_91/L9.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1125/outputs/hisat2_hg38_91/L10.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1126/outputs/hisat2_hg38_91/L11.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1127/outputs/hisat2_hg38_91/L12.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1128/outputs/hisat2_hg38_91/L13.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1129/outputs/hisat2_hg38_91/L14.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1130/outputs/hisat2_hg38_91/L15.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1131/outputs/hisat2_hg38_91/L16.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1132/outputs/hisat2_hg38_91/L17.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1133/outputs/hisat2_hg38_91/N1.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1134/outputs/hisat2_hg38_91/N2.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1135/outputs/hisat2_hg38_91/N3.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1136/outputs/hisat2_hg38_91/N4.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1137/outputs/hisat2_hg38_91/N5.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1138/outputs/hisat2_hg38_91/N6.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1139/outputs/hisat2_hg38_91/N7.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1140/outputs/hisat2_hg38_91/N8.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1141/outputs/hisat2_hg38_91/N9.count.xz contains 58307 rows and merges to 58307 rows.
## preprocessing/hpgl1142/outputs/hisat2_hg38_91/N10.count.xz contains 58307 rows and merges to 58307 rows.
## Finished reading count tables.
## Matched 57645 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## The final expressionset has 58302 rows and 35 columns.
written_csv <- readr::write_csv(x=as.data.frame(exprs(plosntd1_expt)),
path=glue::glue("{prefix}{ver}.csv"))
written_xls <- write_expt(plosntd1_expt,
excel=glue::glue("{prefix}{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## varpart sees only 1 batch, adjusting the model accordingly.
## Attempting mixed linear model with: ~ (1|condition)
## Fitting the expressionset to the model, this is slow.
## A couple of common errors:
## An error like 'vtv downdated' may be because there are too many 0s, filter the data and rerun.
## An error like 'number of levels of each grouping factor must be < number of observations' means
## that the factor used is not appropriate for the analysis - it really only works for factors
## which are shared among multiple samples.
## Retrying with only condition in the model.
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Attempting again with only condition failed.
## Error in simple_varpart(expt, predictor = NULL, factors = c("condition", :
##
## Writing the normalized reads.
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Graphing the normalized reads.
## varpart sees only 1 batch, adjusting the model accordingly.
## Attempting mixed linear model with: ~ (1|condition)
## Fitting the expressionset to the model, this is slow.
## A couple of common errors:
## An error like 'vtv downdated' may be because there are too many 0s, filter the data and rerun.
## An error like 'number of levels of each grouping factor must be < number of observations' means
## that the factor used is not appropriate for the analysis - it really only works for factors
## which are shared among multiple samples.
## Retrying with only condition in the model.
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Attempting again with only condition failed.
## Error in simple_varpart(norm_data, predictor = NULL, factors = c("condition", :
##
## Writing the median reads by factor.
## Don't forget, using salmon, _every_ sample will be included!
salmon_prefix <- "excel/plos_ntd_host_salmon-v"
salmon_plosntd1_expt <- create_expt(sample_sheet,
file_column="hsapiensfile",
gene_info=new_hs_annot,
tx_gene_map=hs_tx_gene,
savefile=glue::glue("{salmon_prefix}{ver}.rda"))
## Reading the sample metadata.
## The sample definitions comprises: 441 rows(samples) and 68 columns(metadata fields).
## Reading count tables.
## Using the transcript to gene mapping.
## Reading salmon data with tximport.
## Finished reading count tables.
## Matched 19629 annotations and counts.
## Bringing together the count matrix and gene information.
## The mapped IDs are not the rownames of your gene information, changing them now.
## Some annotations were lost in merging, setting them to 'undefined'.
## Warning in create_expt(sample_sheet, file_column = "hsapiensfile",
## gene_info = new_hs_annot, : The following samples have no counts!
## hpgl0721hpgl0722hpgl0723hpgl0724
## The final expressionset has 19629 rows and 271 columns.
## Using a subset expression.
## There were 271, now there are 35 samples.
save_result <- try(save(salmon_plosntd1_expt, file=paste0(salmon_prefix, ver, ".rda")))
salmon_written_csv <- readr::write_csv(x=as.data.frame(exprs(salmon_plosntd1_expt)),
path=glue::glue("{salmon_prefix}{ver}.csv"))
salmon_written_xls <- write_expt(salmon_plosntd1_expt,
excel=glue::glue("{salmon_prefix}{ver}.xlsx"))
## Writing the first sheet, containing a legend and some summary data.
## Writing the raw reads.
## Graphing the raw reads.
## varpart sees only 1 batch, adjusting the model accordingly.
## Attempting mixed linear model with: ~ (1|condition)
## Fitting the expressionset to the model, this is slow.
## A couple of common errors:
## An error like 'vtv downdated' may be because there are too many 0s, filter the data and rerun.
## An error like 'number of levels of each grouping factor must be < number of observations' means
## that the factor used is not appropriate for the analysis - it really only works for factors
## which are shared among multiple samples.
## Retrying with only condition in the model.
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Attempting again with only condition failed.
## Error in simple_varpart(expt, predictor = NULL, factors = c("condition", :
##
## Writing the normalized reads.
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Graphing the normalized reads.
## varpart sees only 1 batch, adjusting the model accordingly.
## Attempting mixed linear model with: ~ (1|condition)
## Fitting the expressionset to the model, this is slow.
## A couple of common errors:
## An error like 'vtv downdated' may be because there are too many 0s, filter the data and rerun.
## An error like 'number of levels of each grouping factor must be < number of observations' means
## that the factor used is not appropriate for the analysis - it really only works for factors
## which are shared among multiple samples.
## Retrying with only condition in the model.
## Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
## contrasts can be applied only to factors with 2 or more levels
## Attempting again with only condition failed.
## Error in simple_varpart(norm_data, predictor = NULL, factors = c("condition", :
##
## Writing the median reads by factor.
## [1] "expt" "fun" "hs_annot"
## [4] "hs_tx_gene" "new_hs_annot" "old_options"
## [7] "plosntd1_expt" "prefix" "previous_file"
## [10] "rundate" "salmon_plosntd1_expt" "salmon_prefix"
## [13] "salmon_written_csv" "salmon_written_xls" "sample_sheet"
## [16] "save_result" "tt" "ver"
## [19] "written_csv" "written_xls"
## Length Class Mode
## title 1 -none- character
## notes 1 -none- character
## initial_metadata 69 data.frame list
## expressionset 1 ExpressionSet S4
## design 69 data.frame list
## conditions 35 factor numeric
## batches 35 factor numeric
## samplenames 35 -none- character
## colors 35 -none- character
## state 5 -none- list
## libsize 35 -none- numeric
## hpgl1108 hpgl1109 hpgl1110 hpgl1111 hpgl1112 hpgl1113 hpgl1114
## ENSG00000000003 1017 742 742 1585 871 1697 835
## hpgl1115 hpgl1116 hpgl1117 hpgl1118 hpgl1119 hpgl1120 hpgl1121
## ENSG00000000003 383 652 934 445 428 424 592
## hpgl1122 hpgl1123 hpgl1124 hpgl1125 hpgl1126 hpgl1127 hpgl1128
## ENSG00000000003 192 117 439 1009 664 971 889
## hpgl1129 hpgl1130 hpgl1131 hpgl1132 hpgl1133 hpgl1134 hpgl1135
## ENSG00000000003 396 653 408 445 4579 3436 954
## hpgl1136 hpgl1137 hpgl1138 hpgl1139 hpgl1140 hpgl1141 hpgl1142
## ENSG00000000003 1021 1126 1126 1120 687 900 1508
## sampleid study lab host hoststrain hostcelltype
## hpgl1108 hpgl1108 plosntd1 mosser homo_sapiens <NA> skin
## hostcellsource infectstate differentiationmethod stimulation
## hpgl1108 <NA> yes <NA> <NA>
## pathogenspecies pathogenstrain pathogenstage expttime moi
## hpgl1108 lbraziliensis <NA> amastigote t21d NA
## numberparasitecells numberhostcells infectstateold donor celltype
## hpgl1108 NA NA infected <NA> skin
## state studybatch skipped host1 pathogenspecies1
## hpgl1108 lb_infected <NA> no homo_sapiens lbraziliensis
## hsapiensfile
## hpgl1108 preprocessing/hpgl1108/outputs/salmon_hg38_91/quant.sf
## hsapienshisatfile
## hpgl1108 preprocessing/hpgl1108/outputs/hisat2_hg38_91/E1.count.xz
## mmusculusfile lmajorfile
## hpgl1108 <NA> <NA>
## lmexicanafile
## hpgl1108 preprocessing/hpgl1108/outputs/salmon_lmexicana_v36/quant.sf
## lpanamensisfile
## hpgl1108 <NA>
## lbraziliensisfile
## hpgl1108 preprocessing/hpgl1108/outputs/salmon_lbraziliensis_v37/quant.sf
## lamazonensisfile samplename infectionstatus samplealias
## hpgl1108 <NA> 9224523042_I early infection E1
## parasitedetectionstatus lesionsizemm illnessdurationdays patientage
## hpgl1108 negative 36 21 44
## patientsex experimentalias tubelabel tubealias expperson exptdate
## hpgl1108 F <NA> <NA> <NA> <NA> NA
## cellsperwell infectionperiod media parasiteenrichment
## hpgl1108 NA <NA> <NA> <NA>
## parasitesperinfectedcell parasitesper100cells percentinfectedcells
## hpgl1108 <NA> NA <NA>
## parasitecellrange rnangul libraryconstruction sraaccession condition
## hpgl1108 <NA> NA <NA> SRR3162852 skin
## batch notes totalreads trimmedreads mappedhost mappedparasite
## hpgl1108 medimmune <NA> NA 35419185 12379219 399150
## host2 percentmappedhost percentmappedparasite sampleid.1 file
## hpgl1108 homo_sapiens 0.3495 0.01127 hpgl1108 null
## ensembl_transcript_id ensembl_gene_id version
## ENSG00000000003 ENST00000373020 ENSG00000000003 14
## transcript_version hgnc_symbol
## ENSG00000000003 8 TSPAN6
## description
## ENSG00000000003 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## gene_biotype cds_length chromosome_name strand start_position
## ENSG00000000003 protein_coding 738 X - 100627109
## end_position
## ENSG00000000003 100639991