| batch | condition | media | treatment | induced | exp_12 | |
|---|---|---|---|---|---|---|
| SK064 | 1 | LB-IPTG | LB | DMSO | yes | control |
| SK065 | 2 | LB-IPTG | LB | DMSO | yes | control |
| SK066 | 3 | LB-IPTG | LB | DMSO | yes | control |
| SK085 | 1 | SCFM-IPTG | SCFM | DMSO | yes | treatment |
| SK086 | 2 | SCFM-IPTG | SCFM | DMSO | yes | treatment |
| SK087 | 3 | SCFM-IPTG | SCFM | DMSO | yes | treatment |
## batch condition media treatment induced exp_12
## SK064 1 LB-IPTG LB DMSO yes control
## SK065 2 LB-IPTG LB DMSO yes control
## SK066 3 LB-IPTG LB DMSO yes control
## SK085 1 SCFM-IPTG SCFM DMSO yes treatment
## SK086 2 SCFM-IPTG SCFM DMSO yes treatment
## SK087 3 SCFM-IPTG SCFM DMSO yes treatment
## [1] "The control is: LB-IPTG"
## [1] "The treatment is: SCFM-IPTG"
# Load raw count data
rawcounts <- read.csv("../../raw-data/htseq_SK_unfiltered_reverse_strand_count.csv",
row.names = 1, header = TRUE)
# Append a prefix to the batch and run columns to turn to character class
metadata[["batch"]] <- paste0("b", metadata[["batch"]])
metadata[["run"]] <- paste0("r", metadata[["run"]])
# Subset the rawcounts dataframe to only include samples that are present in experiment
samples <- rownames(metadata)
rawcounts <- select(rawcounts, all_of(samples))
# Ensure that the order of samples in metadata matches the order of columns in rawcounts
all(rownames(metadata) == colnames(rawcounts)) # Should return TRUE if order is correct## [1] TRUE
# Define the design formula for DESeq2, including main condition and batch effect
design_formula <- ~ exp_12 + batch
# Create DESeq2 dataset object from the filtered count matrix and metadata
dds <- DESeqDataSetFromMatrix(
countData = rawcounts,
colData = metadata,
design = design_formula
)## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# Sum up the counts for each sample
rawcounts_sums <- colSums(rawcounts)
# Turn it into a data frame
data <- data.frame(
Category = names(rawcounts_sums),
Count = as.numeric(rawcounts_sums)
)
# Make sure sample order is consistent
data <- data %>%
mutate(Category = factor(Category, levels = sort(unique(Category))))
# Plot bar chart of library sizes
p <- ggplot(data, aes(x = Category, y = Count, fill = Category)) +
geom_bar(stat = "identity") +
# coord_flip() # flip if too many samples to read labels easily
labs(y = "Library size (M)", x = NULL) + # remove x label, rename y axis
# scale_fill_brewer(palette = "Dark2") + # optional color palette
scale_y_continuous(
labels = scales::label_number(scale = 1e-6), # show counts in millions
limits = c(0, (max(data$Count) + 1e6)) # add a bit of space on top
) +
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
axis.text = element_text(size = text.size, angle = 45, hjust = 1), # slant labels
axis.title = element_text(size = axis.text.size),
legend.text = element_text(size = text.size),
legend.title = element_text(size = axis.text.size),
panel.border = element_rect(fill = NA),
legend.position = "none", # no legend needed since each bar is a different sample
axis.ticks = element_line(linewidth = 0.2),
legend.key = element_blank()
)
# Show the plot
p# Save the plot as a PDF with today’s date and project name
cairo_pdf(paste0(Sys.Date(), "_library_size_", exp, "_", treatment, ".vs.", control, ".pdf"))
p
dev.off() ## quartz_off_screen
## 2
# Stack rawcounts into single vector
dat <- stack(rawcounts)
# Format expression distribution plot with the expression level and density
expression_dist <- ggplot(dat, aes(x = log2(values+1), fill=ind, label = ind )) +
geom_density(alpha = 0.5, position = "identity") +
ylab("Density") +
geom_textdensity(hjust = "ymax", vjust = 0,
text_only = TRUE, text_smoothing = 20) +
theme(panel.grid = element_blank(),
panel.background = element_blank(),
axis.text = element_text(size = text.size),
axis.title = element_text(size = axis.text.size),
legend.text = element_text(size = text.size),
legend.title = element_text(size = axis.text.size),
panel.border = element_rect(fill = NA),
legend.position = "none",
axis.ticks = element_line(linewidth = 0.2),
legend.key = element_blank()) +
coord_cartesian(xlim = c(0, 15))
expression_dist#save figure
cairo_pdf(paste0(Sys.Date(), "_expression_dist_", exp, "_", treatment, ".vs.", control, ".pdf"))
expression_dist
dev.off() ## quartz_off_screen
## 2
# Prepare data by getting library sizes and non-zero counts
x <- colSums(rawcounts)
y <- colSums(rawcounts != 0)
df <- data.frame(x,y)
df$ind <- rownames(df)
# Calculate axis limits
x_min <- min(df$x)
x_max <- max(df$x)
y_min <- min(df$y)
y_max <- max(df$y)
# Non-zero plot
nonzero <- ggplot(data = df, aes(x = x, y = y, color = ind, label = ind)) +
geom_point(size = 3) +
geom_text(hjust = -0.1, vjust = -0.1, color = "black", size = 5) +
scale_y_continuous(limits=c(y_min, y_max)) +
scale_x_continuous(
limits = c(x_min-1e3, x_max+9e5),
labels = scales::comma_format(scale = 1e-6, accuracy = 1) # Format labels without decimals
) +
xlab("Millions of reads") + ylab("Number of non-zero genes observed") +
labs(color = "Sample") +
theme(panel.grid = element_blank(),
panel.background = element_blank(),
axis.text = element_text(size = text.size),
axis.title = element_text(size = axis.text.size),
legend.text = element_text(size = text.size),
legend.title = element_text(size = axis.text.size),
panel.border = element_rect(fill = NA),
legend.position = "none",
axis.ticks = element_line(linewidth = 0.2),
legend.key = element_blank()) +
coord_cartesian()
# Show nonzero plot
nonzero# Save plot
cairo_pdf(paste0(Sys.Date(), "_non-zero_", exp, "_", treatment, ".vs.", control, ".pdf"))
nonzero
dev.off() ## quartz_off_screen
## 2
# Load GFF annotation
gff_annot <- load_gff_annotations("../../raw-data/paeruginosa_pa14.gff",
id_col = "gene_id", type = "gene")## Returning a df with 16 columns and 5979 rows.
## seqnames start end width strand source type score phase
## gene1650835 chromosome 483 2027 1545 + PseudoCAP gene NA 0
## gene1650837 chromosome 2056 3159 1104 + PseudoCAP gene NA 0
## gene1650839 chromosome 3169 4278 1110 + PseudoCAP gene NA 0
## gene1650841 chromosome 4275 6695 2421 + PseudoCAP gene NA 0
## gene1650843 chromosome 7018 7791 774 - PseudoCAP gene NA 0
## gene1650845 chromosome 7803 8339 537 - PseudoCAP gene NA 0
## gene_id Name Dbxref Alias name Parent locus
## gene1650835 gene1650835 GeneID:4384099 PA14_00010 dnaA <NA>
## gene1650837 gene1650837 GeneID:4384100 PA14_00020 dnaN <NA>
## gene1650839 gene1650839 GeneID:4384101 PA14_00030 recF <NA>
## gene1650841 gene1650841 GeneID:4384102 PA14_00050 gyrB <NA>
## gene1650843 gene1650843 GeneID:4385186 PA14_00060 PA14_00060 <NA>
## gene1650845 gene1650845 GeneID:4385187 PA14_00070 PA14_00070 <NA>
## SK064 SK065 SK066 SK085 SK086 SK087
## gene1650835 2234 2528 1719 1579 1694 1158
## gene1650837 1266 1754 1368 1732 2280 1356
## gene1650839 722 770 657 377 454 293
## gene1650841 3671 4307 3347 1914 2500 1745
## gene1650843 436 435 313 292 389 236
## gene1650845 257 299 177 177 180 124
## [1] 5979 16
## [1] 5979 6
## Mode FALSE TRUE
## logical 932 5047
## Mode TRUE
## logical 5979
# Create an expression object with counts, metadata, and gene info
expt <- create_expt(count_dataframe = rawcounts,
metadata = metadata,
gene_info = gff_annot)## Reading the sample metadata.
## Did not find the column: sampleid.
## The rownames do not appear numeric, using them.
## The sample definitions comprises: 6 rows(samples) and 8 columns(metadata fields).
## Matched 5979 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 5979 features and 6 samples.
# Normalize counts log2 CPM
cpm_normalized_exp <- normalize_expt(expt,
transform = "log2",
norm = "raw",
convert = "cpm",
row_min = "10")## transform_counts: Found 273 values equal to 0, adding 1 to the matrix.
####### Calculate RPKM ########
# Grab gene lengths from GFF (should be in base pairs)
gene_lengths <- gff_annot$width
names(gene_lengths) <- rownames(gff_annot) # set gene names
# Make sure gene_lengths are in the same order as rawcounts rows
gene_lengths <- gene_lengths[rownames(rawcounts)]
# Make DGEList object for edgeR
dge <- DGEList(counts = rawcounts)
# Calculate RPKM (reads per kilobase per million)
rpkm <- rpkm(dge, gene.length = gene_lengths)
# Wrap RPKM in an experiment object too
expt_rpkm <- create_expt(count_dataframe = rpkm,
metadata = metadata,
gene_info = gff_annot)## Reading the sample metadata.
## Did not find the column: sampleid.
## The rownames do not appear numeric, using them.
## The sample definitions comprises: 6 rows(samples) and 8 columns(metadata fields).
## Matched 5979 annotations and counts.
## Bringing together the count matrix and gene information.
## Some annotations were lost in merging, setting them to 'undefined'.
## Saving the expressionset to 'expt.rda'.
## The final expressionset has 5979 features and 6 samples.
# log2 transform the RPKM (again, no normalization beyond log)
rpkm_normalized_exp <- normalize_expt(expt_rpkm,
transform = "log2",
norm = "raw")## transform_counts: Found 273 values equal to 0, adding 1 to the matrix.
# Get the actual expression matrices from the experiment objects
rpkm <- exprs(rpkm_normalized_exp)
cpm <- exprs(cpm_normalized_exp)
# Merge CPM and RPKM matrices with gene annotation info
cpm_annotated <- merge(gff_annot, cpm, by = 0)
rpkm_annotated <- merge(gff_annot, rpkm, by = 0)This generates a small heatmap of the alginate genes across samples to visualize expression.
# Loop through CPM and RPKM to plot both heatmaps
for (type in c("cpm", "rpkm")) {
# Define gene list
alg_operon_genes <- c(
"algA", "algF", "algJ", "algI", "algL", "algX", "algG", "algE",
"algK", "alg44", "alg8", "algD", "mucC", "mucB", "algU", "algW",
"algP", "algQ", "algR", "algZ", "algC", "algB"
)
# Get the right data per type of transformation
data_annot <- get(paste0(type, "_annotated"))
# Fix rownames using gene name (make unique)
rownames(data_annot) <- make.names(data_annot$name, unique = TRUE)
# Drop undesired metadata columns
cols_to_exclude <- c("Row.names","seqnames","start","end","width",
"strand","source","type","score","phase",
"gene_id","Name","Dbxref","Alias","name",
"Parent","locus")
keep <- !names(data_annot) %in% cols_to_exclude
expr_data <- data_annot[, keep]
# Extract matrix for alg genes
alg_genes <- expr_data[alg_operon_genes, , drop = FALSE]
mat <- alg_genes - rowMeans(alg_genes)
# Pull sample annotations
anno <- as.data.frame(metadata[, c("condition", "induced")])
colnames(anno) <- c("Condition", "Induced")
# Color settings for annotation
# ann_colors <- list(
# Condition = c("LB-DMSO" = "#D95F02", "LB-IPTG" = "#66A61E"),
# Induced = c("no" = "lightgrey", "yes" = "gray27")
# )
# Scale data by row (Z-score per gene)
scaled_mat <- t(scale(t(mat)))
# Build heatmap
ht <- Heatmap(
scaled_mat,
name = "Z-score",
top_annotation = HeatmapAnnotation(df = anno#,
#col = ann_colors
),
cluster_rows = TRUE,
cluster_columns = TRUE,
row_names_side = "left",
column_names_side = "bottom",
column_names_rot = 45,
row_names_gp = gpar(fontsize = 10),
column_names_gp = gpar(fontsize = 10),
show_row_dend = TRUE,
show_column_dend = TRUE
)
# Draw and save
draw(ht)
cairo_pdf(paste0(Sys.Date(), "_alg_heatmap_", type, "_", exp, "_", treatment, ".vs.", control, ".pdf"))
draw(ht)
dev.off()
}# Create DESeqDataSet
dds <- DESeqDataSetFromMatrix(
countData = rawcounts,
colData = metadata,
design = design_formula
)## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## [1] "Number of genes before filtering:"
## [1] 5979
# Filter: keep genes with counts >=10 in at least 2 samples
print("Filtering genes with counts >=10 in at least 2 samples")## [1] "Filtering genes with counts >=10 in at least 2 samples"
keep <- rowSums(counts(dds) >= 10) >= 2
dds_filtered <- dds[keep, ]
# Check number of genes after filtering
print("Number of genes after filtering:")## [1] "Number of genes after filtering:"
## [1] 5787
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] "Intercept" "exp_12_treatment_vs_control"
## [3] "batch_b2_vs_b1" "batch_b3_vs_b1"
res <- results(dds_final, contrast = c("exp_12", "treatment", "control"))
resdf <- as.data.frame(res)
resdf$gene_id <- rownames(resdf)
cpm <- as.data.frame(cpm)
rpkm <- as.data.frame(rpkm)
# append type of value to the dataframe
colnames(rpkm) <- paste0(colnames(rpkm), "_rpkm")
colnames(cpm) <- paste0(colnames(cpm), "_cpm")
rawcounts2 <- rawcounts
colnames(rawcounts2) <- paste0(colnames(rawcounts2), "_rawcounts")
# ensure column gene_id is there
rpkm$gene_id <- rownames(rpkm)
cpm$gene_id <- rownames(cpm)
rawcounts2$gene_id <- rownames(rawcounts2)
gff_annot <- gff_annot[rownames(resdf),]
dim(resdf)## [1] 5787 7
# merge with siggenes
resdf <- full_join(resdf, gff_annot, by = "gene_id", copy = FALSE)
head(resdf)## baseMean log2FoldChange lfcSE stat pvalue padj
## 1 1780.6985 -0.4694423 0.1477369 -3.177557 1.485214e-03 3.361335e-03
## 2 1603.8835 0.3530781 0.1607698 2.196171 2.807968e-02 4.966281e-02
## 3 535.8251 -0.8730243 0.1754552 -4.975767 6.498978e-07 2.238666e-06
## 4 2859.9504 -0.8078822 0.1414506 -5.711408 1.120450e-08 4.540645e-08
## 5 342.5872 -0.3013717 0.1903226 -1.583479 1.133124e-01 1.690484e-01
## 6 197.1930 -0.5169827 0.2218758 -2.330054 1.980329e-02 3.632357e-02
## gene_id seqnames start end width strand source type score phase
## 1 gene1650835 chromosome 483 2027 1545 + PseudoCAP gene NA 0
## 2 gene1650837 chromosome 2056 3159 1104 + PseudoCAP gene NA 0
## 3 gene1650839 chromosome 3169 4278 1110 + PseudoCAP gene NA 0
## 4 gene1650841 chromosome 4275 6695 2421 + PseudoCAP gene NA 0
## 5 gene1650843 chromosome 7018 7791 774 - PseudoCAP gene NA 0
## 6 gene1650845 chromosome 7803 8339 537 - PseudoCAP gene NA 0
## Name Dbxref Alias name Parent locus
## 1 GeneID:4384099 PA14_00010 dnaA <NA>
## 2 GeneID:4384100 PA14_00020 dnaN <NA>
## 3 GeneID:4384101 PA14_00030 recF <NA>
## 4 GeneID:4384102 PA14_00050 gyrB <NA>
## 5 GeneID:4385186 PA14_00060 PA14_00060 <NA>
## 6 GeneID:4385187 PA14_00070 PA14_00070 <NA>
## locusId accession GI scaffoldId start stop strand Alias
## 1 2194572 YP_788156.1 116053721 4582 483 2027 + PA14_00010
## 2 2194573 YP_788157.1 116053722 4582 2056 3159 + PA14_00020
## 3 2194574 YP_788158.1 116053723 4582 3169 4278 + PA14_00030
## 4 2194575 YP_788159.1 116053724 4582 4275 6695 + PA14_00050
## 5 2194576 YP_788160.1 116053725 4582 7791 7018 - PA14_00060
## 6 2194577 YP_788161.1 116053726 4582 8339 7803 - PA14_00070
## name desc COG
## 1 dnaA chromosomal replication initiator protein DnaA (NCBI) COG593
## 2 dnaN DNA polymerase III, beta chain (NCBI) COG592
## 3 recF DNA replication and repair protein RecF (NCBI) COG1195
## 4 gyrB DNA gyrase subunit B (NCBI) COG187
## 5 PA14_00060 putative acyltransferase (NCBI) COG204
## 6 PA14_00070 putative histidinol-phosphatase (NCBI) COG241
## COGFun
## 1 L
## 2 L
## 3 L
## 4 L
## 5 I
## 6 E
## COGDesc
## 1 ATPase involved in DNA replication initiation
## 2 DNA polymerase sliding clamp subunit (PCNA homolog)
## 3 Recombinational DNA repair ATPase (RecF pathway)
## 4 Type IIA topoisomerase (DNA gyrase/topo II, topoisomerase IV), B subunit
## 5 1-acyl-sn-glycerol-3-phosphate acyltransferase
## 6 Histidinol phosphatase and related phosphatases
## TIGRFam
## 1 TIGR00362 chromosomal replication initiator protein DnaA [dnaA]
## 2 TIGR00663 DNA polymerase III, beta subunit [dnaN]
## 3 TIGR00611 DNA replication and repair protein RecF [recF]
## 4 TIGR01059 DNA gyrase, B subunit [gyrB]
## 5
## 6 TIGR01656 histidinol-phosphate phosphatase domain,TIGR01662 HAD hydrolase, family IIIA
## TIGRRoles
## 1 DNA metabolism:DNA replication, recombination, and repair
## 2 DNA metabolism:DNA replication, recombination, and repair
## 3 DNA metabolism:DNA replication, recombination, and repair
## 4 DNA metabolism:DNA replication, recombination, and repair
## 5
## 6 Unknown function:Enzymes of unknown specificity
## GO
## 1 GO:0006270,GO:0006275,GO:0003688,GO:0017111,GO:0005524
## 2 GO:0006260,GO:0003677,GO:0003893,GO:0008408,GO:0016449,GO:0019984,GO:0003889,GO:0003894,GO:0015999,GO:0016450,GO:0003890,GO:0003895,GO:0016000,GO:0016451,GO:0003891,GO:0016448,GO:0016452
## 3 GO:0006281,GO:0005694,GO:0005524,GO:0017111,GO:0003697
## 4 GO:0006304,GO:0006265,GO:0005694,GO:0003918,GO:0005524
## 5 GO:0008152,GO:0003841
## 6 GO:0000105,GO:0004401
## EC ECDesc
## 1
## 2 2.7.7.7 DNA-directed DNA polymerase.
## 3
## 4 5.99.1.3 DNA topoisomerase (ATP-hydrolyzing).
## 5 2.3.1.51 1-acylglycerol-3-phosphate O-acyltransferase.
## 6 3.1.3.-
col_to_remove <- intersect(colnames(pa_go),colnames(resdf))
col_to_remove <- setdiff(col_to_remove, "Alias") # returns all except the second vector
pa_go <- pa_go %>% select(-all_of(col_to_remove))
resdf <- full_join(resdf, rpkm, by = "gene_id", copy = FALSE)
resdf <- full_join(resdf, cpm, by = "gene_id", copy = FALSE)
resdf <- full_join(resdf, rawcounts2, by = "gene_id", copy = FALSE)
#resdf <- full_join(resdf, pa_go, by = "Alias", copy = FALSE)
resdf <- merge(resdf, pa_go, by = "Alias", copy = FALSE)
head(resdf)## Alias baseMean log2FoldChange lfcSE stat pvalue
## 1 PA14_00010 1780.6985 -0.4694423 0.1477369 -3.177557 1.485214e-03
## 2 PA14_00020 1603.8835 0.3530781 0.1607698 2.196171 2.807968e-02
## 3 PA14_00030 535.8251 -0.8730243 0.1754552 -4.975767 6.498978e-07
## 4 PA14_00050 2859.9504 -0.8078822 0.1414506 -5.711408 1.120450e-08
## 5 PA14_00060 342.5872 -0.3013717 0.1903226 -1.583479 1.133124e-01
## 6 PA14_00070 197.1930 -0.5169827 0.2218758 -2.330054 1.980329e-02
## padj gene_id seqnames start end width strand source type
## 1 3.361335e-03 gene1650835 chromosome 483 2027 1545 + PseudoCAP gene
## 2 4.966281e-02 gene1650837 chromosome 2056 3159 1104 + PseudoCAP gene
## 3 2.238666e-06 gene1650839 chromosome 3169 4278 1110 + PseudoCAP gene
## 4 4.540645e-08 gene1650841 chromosome 4275 6695 2421 + PseudoCAP gene
## 5 1.690484e-01 gene1650843 chromosome 7018 7791 774 - PseudoCAP gene
## 6 3.632357e-02 gene1650845 chromosome 7803 8339 537 - PseudoCAP gene
## score phase Name Dbxref name Parent locus SK064_rpkm SK065_rpkm
## 1 NA 0 GeneID:4384099 dnaA <NA> 8.176179 8.165287
## 2 NA 0 GeneID:4384100 dnaN <NA> 7.842995 8.122956
## 3 NA 0 GeneID:4384101 recF <NA> 7.029755 6.934080
## 4 NA 0 GeneID:4384102 gyrB <NA> 8.244496 8.285574
## 5 NA 0 GeneID:4385186 PA14_00060 <NA> 6.823946 6.633153
## 6 NA 0 GeneID:4385187 PA14_00070 <NA> 6.591050 6.619830
## SK066_rpkm SK085_rpkm SK086_rpkm SK087_rpkm SK064_cpm SK065_cpm SK066_cpm
## 1 7.893669 7.989449 7.833958 7.830711 8.802026 8.791120 8.519135
## 2 8.048415 8.605764 8.744455 8.540827 7.985143 8.265208 8.190642
## 3 6.988440 6.411482 6.421924 6.336705 7.179220 7.083470 7.137873
## 4 8.205787 7.620694 7.747847 7.774559 9.517304 9.558461 9.478520
## 5 6.444121 6.561349 6.716019 6.542347 6.458065 6.267796 6.079357
## 6 6.152874 6.368749 6.138494 6.146263 5.706889 5.735417 5.273245
## SK085_cpm SK086_cpm SK087_cpm SK064_rawcounts SK065_rawcounts SK066_rawcounts
## 1 8.615052 8.459333 8.456080 2234 2528 1719
## 2 8.748155 8.886879 8.683202 1266 1754 1368
## 3 6.560362 6.570815 6.485495 722 770 657
## 4 8.891988 9.019505 9.046289 3671 4307 3347
## 5 6.196208 6.350426 6.177265 436 435 313
## 6 5.486717 5.259038 5.266713 257 299 177
## SK085_rawcounts SK086_rawcounts SK087_rawcounts locusId accession GI
## 1 1579 1694 1158 2194572 YP_788156.1 116053721
## 2 1732 2280 1356 2194573 YP_788157.1 116053722
## 3 377 454 293 2194574 YP_788158.1 116053723
## 4 1914 2500 1745 2194575 YP_788159.1 116053724
## 5 292 389 236 2194576 YP_788160.1 116053725
## 6 177 180 124 2194577 YP_788161.1 116053726
## scaffoldId stop desc COG
## 1 4582 2027 chromosomal replication initiator protein DnaA (NCBI) COG593
## 2 4582 3159 DNA polymerase III, beta chain (NCBI) COG592
## 3 4582 4278 DNA replication and repair protein RecF (NCBI) COG1195
## 4 4582 6695 DNA gyrase subunit B (NCBI) COG187
## 5 4582 7018 putative acyltransferase (NCBI) COG204
## 6 4582 7803 putative histidinol-phosphatase (NCBI) COG241
## COGFun
## 1 L
## 2 L
## 3 L
## 4 L
## 5 I
## 6 E
## COGDesc
## 1 ATPase involved in DNA replication initiation
## 2 DNA polymerase sliding clamp subunit (PCNA homolog)
## 3 Recombinational DNA repair ATPase (RecF pathway)
## 4 Type IIA topoisomerase (DNA gyrase/topo II, topoisomerase IV), B subunit
## 5 1-acyl-sn-glycerol-3-phosphate acyltransferase
## 6 Histidinol phosphatase and related phosphatases
## TIGRFam
## 1 TIGR00362 chromosomal replication initiator protein DnaA [dnaA]
## 2 TIGR00663 DNA polymerase III, beta subunit [dnaN]
## 3 TIGR00611 DNA replication and repair protein RecF [recF]
## 4 TIGR01059 DNA gyrase, B subunit [gyrB]
## 5
## 6 TIGR01656 histidinol-phosphate phosphatase domain,TIGR01662 HAD hydrolase, family IIIA
## TIGRRoles
## 1 DNA metabolism:DNA replication, recombination, and repair
## 2 DNA metabolism:DNA replication, recombination, and repair
## 3 DNA metabolism:DNA replication, recombination, and repair
## 4 DNA metabolism:DNA replication, recombination, and repair
## 5
## 6 Unknown function:Enzymes of unknown specificity
## GO
## 1 GO:0006270,GO:0006275,GO:0003688,GO:0017111,GO:0005524
## 2 GO:0006260,GO:0003677,GO:0003893,GO:0008408,GO:0016449,GO:0019984,GO:0003889,GO:0003894,GO:0015999,GO:0016450,GO:0003890,GO:0003895,GO:0016000,GO:0016451,GO:0003891,GO:0016448,GO:0016452
## 3 GO:0006281,GO:0005694,GO:0005524,GO:0017111,GO:0003697
## 4 GO:0006304,GO:0006265,GO:0005694,GO:0003918,GO:0005524
## 5 GO:0008152,GO:0003841
## 6 GO:0000105,GO:0004401
## EC ECDesc
## 1
## 2 2.7.7.7 DNA-directed DNA polymerase.
## 3
## 4 5.99.1.3 DNA topoisomerase (ATP-hydrolyzing).
## 5 2.3.1.51 1-acylglycerol-3-phosphate O-acyltransferase.
## 6 3.1.3.-
## [1] 5780 54
## Alias baseMean log2FoldChange lfcSE stat pvalue
## 1 PA14_00010 1780.6985 -0.4694423 0.1477369 -3.177557 1.485214e-03
## 2 PA14_00020 1603.8835 0.3530781 0.1607698 2.196171 2.807968e-02
## 3 PA14_00030 535.8251 -0.8730243 0.1754552 -4.975767 6.498978e-07
## 4 PA14_00050 2859.9504 -0.8078822 0.1414506 -5.711408 1.120450e-08
## 5 PA14_00060 342.5872 -0.3013717 0.1903226 -1.583479 1.133124e-01
## 6 PA14_00070 197.1930 -0.5169827 0.2218758 -2.330054 1.980329e-02
## padj gene_id seqnames start end width strand source type
## 1 3.361335e-03 gene1650835 chromosome 483 2027 1545 + PseudoCAP gene
## 2 4.966281e-02 gene1650837 chromosome 2056 3159 1104 + PseudoCAP gene
## 3 2.238666e-06 gene1650839 chromosome 3169 4278 1110 + PseudoCAP gene
## 4 4.540645e-08 gene1650841 chromosome 4275 6695 2421 + PseudoCAP gene
## 5 1.690484e-01 gene1650843 chromosome 7018 7791 774 - PseudoCAP gene
## 6 3.632357e-02 gene1650845 chromosome 7803 8339 537 - PseudoCAP gene
## score phase Name Dbxref name Parent locus SK064_rpkm SK065_rpkm
## 1 NA 0 GeneID:4384099 dnaA <NA> 8.176179 8.165287
## 2 NA 0 GeneID:4384100 dnaN <NA> 7.842995 8.122956
## 3 NA 0 GeneID:4384101 recF <NA> 7.029755 6.934080
## 4 NA 0 GeneID:4384102 gyrB <NA> 8.244496 8.285574
## 5 NA 0 GeneID:4385186 PA14_00060 <NA> 6.823946 6.633153
## 6 NA 0 GeneID:4385187 PA14_00070 <NA> 6.591050 6.619830
## SK066_rpkm SK085_rpkm SK086_rpkm SK087_rpkm SK064_cpm SK065_cpm SK066_cpm
## 1 7.893669 7.989449 7.833958 7.830711 8.802026 8.791120 8.519135
## 2 8.048415 8.605764 8.744455 8.540827 7.985143 8.265208 8.190642
## 3 6.988440 6.411482 6.421924 6.336705 7.179220 7.083470 7.137873
## 4 8.205787 7.620694 7.747847 7.774559 9.517304 9.558461 9.478520
## 5 6.444121 6.561349 6.716019 6.542347 6.458065 6.267796 6.079357
## 6 6.152874 6.368749 6.138494 6.146263 5.706889 5.735417 5.273245
## SK085_cpm SK086_cpm SK087_cpm SK064_rawcounts SK065_rawcounts SK066_rawcounts
## 1 8.615052 8.459333 8.456080 2234 2528 1719
## 2 8.748155 8.886879 8.683202 1266 1754 1368
## 3 6.560362 6.570815 6.485495 722 770 657
## 4 8.891988 9.019505 9.046289 3671 4307 3347
## 5 6.196208 6.350426 6.177265 436 435 313
## 6 5.486717 5.259038 5.266713 257 299 177
## SK085_rawcounts SK086_rawcounts SK087_rawcounts locusId accession GI
## 1 1579 1694 1158 2194572 YP_788156.1 116053721
## 2 1732 2280 1356 2194573 YP_788157.1 116053722
## 3 377 454 293 2194574 YP_788158.1 116053723
## 4 1914 2500 1745 2194575 YP_788159.1 116053724
## 5 292 389 236 2194576 YP_788160.1 116053725
## 6 177 180 124 2194577 YP_788161.1 116053726
## scaffoldId stop desc COG
## 1 4582 2027 chromosomal replication initiator protein DnaA (NCBI) COG593
## 2 4582 3159 DNA polymerase III, beta chain (NCBI) COG592
## 3 4582 4278 DNA replication and repair protein RecF (NCBI) COG1195
## 4 4582 6695 DNA gyrase subunit B (NCBI) COG187
## 5 4582 7018 putative acyltransferase (NCBI) COG204
## 6 4582 7803 putative histidinol-phosphatase (NCBI) COG241
## COGFun
## 1 L
## 2 L
## 3 L
## 4 L
## 5 I
## 6 E
## COGDesc
## 1 ATPase involved in DNA replication initiation
## 2 DNA polymerase sliding clamp subunit (PCNA homolog)
## 3 Recombinational DNA repair ATPase (RecF pathway)
## 4 Type IIA topoisomerase (DNA gyrase/topo II, topoisomerase IV), B subunit
## 5 1-acyl-sn-glycerol-3-phosphate acyltransferase
## 6 Histidinol phosphatase and related phosphatases
## TIGRFam
## 1 TIGR00362 chromosomal replication initiator protein DnaA [dnaA]
## 2 TIGR00663 DNA polymerase III, beta subunit [dnaN]
## 3 TIGR00611 DNA replication and repair protein RecF [recF]
## 4 TIGR01059 DNA gyrase, B subunit [gyrB]
## 5
## 6 TIGR01656 histidinol-phosphate phosphatase domain,TIGR01662 HAD hydrolase, family IIIA
## TIGRRoles
## 1 DNA metabolism:DNA replication, recombination, and repair
## 2 DNA metabolism:DNA replication, recombination, and repair
## 3 DNA metabolism:DNA replication, recombination, and repair
## 4 DNA metabolism:DNA replication, recombination, and repair
## 5
## 6 Unknown function:Enzymes of unknown specificity
## GO
## 1 GO:0006270,GO:0006275,GO:0003688,GO:0017111,GO:0005524
## 2 GO:0006260,GO:0003677,GO:0003893,GO:0008408,GO:0016449,GO:0019984,GO:0003889,GO:0003894,GO:0015999,GO:0016450,GO:0003890,GO:0003895,GO:0016000,GO:0016451,GO:0003891,GO:0016448,GO:0016452
## 3 GO:0006281,GO:0005694,GO:0005524,GO:0017111,GO:0003697
## 4 GO:0006304,GO:0006265,GO:0005694,GO:0003918,GO:0005524
## 5 GO:0008152,GO:0003841
## 6 GO:0000105,GO:0004401
## EC ECDesc
## 1
## 2 2.7.7.7 DNA-directed DNA polymerase.
## 3
## 4 5.99.1.3 DNA topoisomerase (ATP-hydrolyzing).
## 5 2.3.1.51 1-acylglycerol-3-phosphate O-acyltransferase.
## 6 3.1.3.-
## Mode FALSE
## logical 5780
## Mode FALSE
## logical 5780
sig_genes <- resdf[abs(resdf$log2FoldChange) >= logfc_cutoff & resdf$padj < pval_cutoff, ] %>% drop_na(log2FoldChange)
sig_genes$direction <- ifelse(sig_genes$log2FoldChange > 0, "up", "down")
summary(is.na(sig_genes$log2FoldChange))## Mode FALSE
## logical 765
## Mode FALSE
## logical 765
## [1] 765 55
## Alias baseMean log2FoldChange lfcSE stat pvalue
## 1 PA14_00080 467.67726 2.587064 0.1847156 14.005660 1.439361e-44
## 2 PA14_00560 216.38063 -2.030352 0.2329481 -8.715898 2.884636e-18
## 3 PA14_00570 229.32258 -3.012398 0.2389427 -12.607195 1.927300e-36
## 4 PA14_00580 73.25642 -2.162844 0.3376209 -6.406130 1.492598e-10
## 5 PA14_00590 62.74549 -2.210213 0.3833066 -5.766175 8.109065e-09
## 6 PA14_00630 425.80690 2.405223 0.1950717 12.329947 6.248789e-35
## padj gene_id seqnames start end width strand source type
## 1 3.331833e-43 gene1650847 chromosome 8671 10377 1707 + PseudoCAP gene
## 2 2.249782e-17 gene1650925 chromosome 57148 58476 1329 + PseudoCAP gene
## 3 3.442371e-35 gene1650927 chromosome 59018 59704 687 + PseudoCAP gene
## 4 7.016786e-10 gene1650929 chromosome 59735 60088 354 + PseudoCAP gene
## 5 3.323453e-08 gene1650931 chromosome 60241 60750 510 + PseudoCAP gene
## 6 1.051213e-33 gene1650937 chromosome 63701 63841 141 + PseudoCAP gene
## score phase Name Dbxref name Parent locus SK064_rpkm SK065_rpkm
## 1 NA 0 GeneID:4385188 PA14_00080 <NA> 4.107099 4.085597
## 2 NA 0 GeneID:4381738 exoT <NA> 5.658212 5.734095
## 3 NA 0 GeneID:4381739 PA14_00570 <NA> 7.184945 6.920300
## 4 NA 0 GeneID:4381740 PA14_00580 <NA> 6.123140 6.069983
## 5 NA 0 GeneID:4381741 PA14_00590 <NA> 5.339107 5.644514
## 6 NA 0 GeneID:4381744 PA14_00630 <NA> 7.484965 7.639493
## SK066_rpkm SK085_rpkm SK086_rpkm SK087_rpkm SK064_cpm SK065_cpm SK066_cpm
## 1 3.996607 6.706746 6.898945 6.939056 4.843465 4.821429 4.730142
## 2 5.805594 3.538913 4.181795 4.378246 6.061464 6.137710 6.209535
## 3 6.614281 4.405289 4.426778 3.828367 6.647838 6.384099 6.079357
## 4 6.119808 3.906260 4.302233 4.588845 4.662246 4.610469 4.658999
## 5 5.121183 3.960665 3.055283 3.545111 4.401519 4.700530 4.189039
## 6 7.667280 10.355004 10.184017 10.276869 4.706979 4.856679 4.883650
## SK085_cpm SK086_cpm SK087_cpm SK064_rawcounts SK065_rawcounts SK066_rawcounts
## 1 7.472477 7.665392 7.705641 139 156 120
## 2 3.918194 4.572322 4.771310 330 397 343
## 3 3.894362 3.915402 3.332293 498 472 313
## 4 2.573782 2.931684 3.196118 122 134 114
## 5 3.075622 2.241650 2.687791 101 143 81
## 6 7.535467 7.365320 7.457703 126 160 134
## SK085_rawcounts SK086_rawcounts SK087_rawcounts locusId accession GI
## 1 713 975 687 2194578 YP_788162.1 116053727
## 2 57 110 87 2194617 YP_788201.1 116053766
## 3 56 68 30 2194618 YP_788202.1 116053767
## 4 20 32 27 2194619 YP_788203.1 116053768
## 5 30 18 18 2194620 YP_788204.1 116053769
## 6 745 791 578 2194623 YP_788207.1 116053772
## scaffoldId stop desc COG COGFun
## 1 4582 10377 hypothetical protein (NCBI)
## 2 4582 58476 exoenzyme T (NCBI) COG5585 T
## 3 4582 59704 putative lipoprotein (NCBI) COG1462 M
## 4 4582 60088 putative lipoprotein (NCBI) COG4259 S
## 5 4582 60750 putative lipoprotein (NCBI) COG4380 S
## 6 4582 63841 hypothetical protein (NCBI)
## COGDesc TIGRFam
## 1
## 2 NAD+--asparagine ADP-ribosyltransferase
## 3 Uncharacterized protein involved in formation of curli polymers
## 4 Uncharacterized protein conserved in bacteria
## 5 Uncharacterized protein conserved in bacteria
## 6
## TIGRRoles GO
## 1 GO:0006118,GO:0020037,GO:0005506,GO:0009055
## 2 GO:0009405,GO:0006471,GO:0016020,GO:0005576,GO:0003956,GO:0005096
## 3 GO:0015031,GO:0042597
## 4
## 5 GO:0005975
## 6
## EC ECDesc direction
## 1 up
## 2 down
## 3 down
## 4 down
## 5 down
## 6 up
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# Variance stabilizing transformations
vsd <- vst(vst, blind = TRUE)
# PCA plot
pcaData <- plotPCA(vsd, intgroup = c("exp_12", "batch", "condition"), returnData = TRUE)## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
# Calculate the maximum absolute value among PC1 and PC2
max_abs_val <- max(c(abs(min(pcaData$PC1)), abs(min(pcaData$PC2)), abs(max(pcaData$PC1)), abs(max(pcaData$PC2))))
axis_size <- 10
theme_set(theme_minimal(base_family = "Arial"))
pca <- ggplot(pcaData, aes(PC1, PC2, color = condition, shape = condition)) +
geom_point(size = 3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
scale_colour_brewer(palette = "Dark2") +
scale_shape_manual(values = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)) +
coord_fixed(ratio = 1, xlim = c(-max_abs_val, max_abs_val), ylim = c(-max_abs_val, max_abs_val)) +
labs(color = "condition", shape = "condition") +
guides(shape = guide_legend(title = "Condition"), # Adjust size of shape legend
color = guide_legend(title = "Condition")) +
theme(
panel.grid = element_blank(),
panel.background = element_blank(),
axis.text = element_text(size = axis_size),
axis.ticks = element_line(linewidth = 0.2),
axis.line = element_line(linewidth = 0, color = "black"), # Set the axis line linewidth
panel.border = element_rect(fill = NA, linewidth = 0.5), # Set the panel border linewidth
legend.key = element_rect(fill = "white", color = NA) # Set legend key background to white and remove border
)
print(pca)cairo_pdf(paste0(Sys.Date(),"_pca_",exp, "_", treatment,".vs.", control,".pdf", sep = ""))
pca
dev.off()## quartz_off_screen
## 2
sequence <- c("treatment")
for (i in sequence) {
set.seed(0)
res <- results(dds_final, contrast = c("exp_12", i, "control"))
gp2 <- wes_palettes$Zissou1
key <- read.csv("../../raw-data/pa14_gene_key.csv", header = TRUE)
rownames(key) <- key$gene_id
res <- merge(key, as.data.frame(res), by=0)
res <- as.data.frame(res)
res$Alias <- rownames(res)
# Define gene list
alg_genes <- c(
"algA", "algF", "algJ", "algI", "algL", "algX", "algG", "algE",
"algK", "alg44", "alg8", "algD", "mucC", "mucB", "algU", "algW",
"algP", "algQ", "algR", "algZ", "algC", "algB"
)
logfc_cutoff
keyvals <- ifelse(
res$log2FoldChange <= -logfc_cutoff & res$padj <= pval_cutoff, gp2[1],
ifelse(res$log2FoldChange >= logfc_cutoff & res$padj <= pval_cutoff, gp2[5],
'lightgrey'))
keyvals[is.na(keyvals)] <- 'lightgrey'
names(keyvals)[keyvals == gp2[5]] <- 'Up-regulated'
names(keyvals)[keyvals == 'lightgrey'] <- 'NS'
names(keyvals)[keyvals == gp2[1]] <- 'Down-regulated'
volcano <- EnhancedVolcano(res,
lab = res[["name"]],
selectLab = alg_genes,
x = 'log2FoldChange',
y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'),
pCutoff = pval_cutoff,
FCcutoff = logfc_cutoff,
cutoffLineType = "dashed",
pointSize = 1,
labSize = 3.0,
axisLabSize = 14,
labCol = 'black',
labFace = 'bold',
#col = c('grey', gp2[2], gp2[3], gp2[1]),
colCustom = keyvals,
colAlpha = 5/5,
legendPosition = 'bottom',
legendLabSize = 12,
legendIconSize = 4.0,
drawConnectors = TRUE,
arrowheads = FALSE,
widthConnectors = 0.2,
gridlines.major = FALSE,
borderWidth = 0.3,
gridlines.minor = FALSE,
title = paste0(treatment," vs ", control, sep = ""),
subtitle = "",
#caption = paste0("total = ", nrow(toptable), " variables"),
caption = paste0("Volcano plot with log2 FC of ", logfc_cutoff," and P value of ",pval_cutoff, "."),
max.overlaps = 30,
#ylim = c(0,10),
colConnectors = 'black') +
theme(axis.ticks=element_line(linewidth =0.2)) # +
# scale_x_continuous(breaks = c(-5, -2.5, 0, 2.5, 5), limits = c(-6,6))
print(volcano)
cairo_pdf(paste0(Sys.Date(),"_volcano_", exp, "_", treatment,".vs.", control,".pdf", sep = ""))
print(volcano)
dev.off()
}## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
sequence <- c("treatment")
for (i in sequence) {
set.seed(0)
res <- results(dds_final, contrast = c("exp_12", i, "control"))
gp2 <- wes_palettes$Zissou1
res$gene_id <- rownames(res)
res <- as.data.frame(res)
head(res)
res <- full_join(key, as.data.frame(res), by="gene_id")
rownames(res) <- res$Alias
any(is.na(res)) # Check if there are any NAs
res <- res %>% filter(!is.na(baseMean))
ma <- ggmaplot(res,
fdr = pval_cutoff,
fc = 2^(logfc_cutoff),
size = 1,
palette = c((gp2[5]), (gp2[1])),
select.top.method = "fc",
# ylim = c(-15,15),
legend = "bottom",
title = paste0(treatment," vs ", control, sep = ""),
subtitle = paste0("Volcano plot with log2 FC of ", logfc_cutoff," and P value of ",pval_cutoff, "."),
font.label = c( 12),
label.rectangle = FALSE,
genenames = res$name,
font.legend = c(12),
font.main = c("bold", 14),
alpha = 1.5,
ggtheme = ggplot2::theme(panel.grid =element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14),
panel.background = element_blank(),
panel.border = element_rect(fill = NA)))
print(ma)
cairo_pdf(paste0(Sys.Date(),"_ma_", exp, "_", treatment,".vs.", control,".pdf"))
print(ma)
dev.off()
}## ========================================
## circlize version 0.4.16
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## SK064 SK065 SK066 SK085 SK086 SK087
## gene1650835 2234 2528 1719 1579 1694 1158
## gene1650837 1266 1754 1368 1732 2280 1356
## gene1650839 722 770 657 377 454 293
## gene1650841 3671 4307 3347 1914 2500 1745
## gene1650843 436 435 313 292 389 236
## gene1650845 257 299 177 177 180 124
## seqnames start end width strand source type score phase
## gene1650835 chromosome 483 2027 1545 + PseudoCAP gene NA 0
## gene1650837 chromosome 2056 3159 1104 + PseudoCAP gene NA 0
## gene1650839 chromosome 3169 4278 1110 + PseudoCAP gene NA 0
## gene1650841 chromosome 4275 6695 2421 + PseudoCAP gene NA 0
## gene1650843 chromosome 7018 7791 774 - PseudoCAP gene NA 0
## gene1650845 chromosome 7803 8339 537 - PseudoCAP gene NA 0
## gene_id Name Dbxref Alias name Parent locus
## gene1650835 gene1650835 GeneID:4384099 PA14_00010 dnaA <NA>
## gene1650837 gene1650837 GeneID:4384100 PA14_00020 dnaN <NA>
## gene1650839 gene1650839 GeneID:4384101 PA14_00030 recF <NA>
## gene1650841 gene1650841 GeneID:4384102 PA14_00050 gyrB <NA>
## gene1650843 gene1650843 GeneID:4385186 PA14_00060 PA14_00060 <NA>
## gene1650845 gene1650845 GeneID:4385187 PA14_00070 PA14_00070 <NA>
## gene_id Alias name baseMean log2FoldChange lfcSE
## PA14_00010 gene1650835 PA14_00010 dnaA 1780.6985 -0.4694423 0.1477369
## PA14_00020 gene1650837 PA14_00020 dnaN 1603.8835 0.3530781 0.1607698
## PA14_00030 gene1650839 PA14_00030 recF 535.8251 -0.8730243 0.1754552
## PA14_00050 gene1650841 PA14_00050 gyrB 2859.9504 -0.8078822 0.1414506
## PA14_00060 gene1650843 PA14_00060 PA14_00060 342.5872 -0.3013717 0.1903226
## PA14_00070 gene1650845 PA14_00070 PA14_00070 197.1930 -0.5169827 0.2218758
## stat pvalue padj
## PA14_00010 -3.177557 1.485214e-03 3.361335e-03
## PA14_00020 2.196171 2.807968e-02 4.966281e-02
## PA14_00030 -4.975767 6.498978e-07 2.238666e-06
## PA14_00050 -5.711408 1.120450e-08 4.540645e-08
## PA14_00060 -1.583479 1.133124e-01 1.690484e-01
## PA14_00070 -2.330054 1.980329e-02 3.632357e-02
rownames(res) <- res$gene_id
gff_annot <- gff_annot[rownames(res),]
summary(rownames(gff_annot) == rownames(res))## Mode TRUE
## logical 5787
## seqnames start end width strand source type score phase
## gene1650835 chromosome 483 2027 1545 + PseudoCAP gene NA 0
## gene1650837 chromosome 2056 3159 1104 + PseudoCAP gene NA 0
## gene1650839 chromosome 3169 4278 1110 + PseudoCAP gene NA 0
## gene1650841 chromosome 4275 6695 2421 + PseudoCAP gene NA 0
## gene1650843 chromosome 7018 7791 774 - PseudoCAP gene NA 0
## gene1650845 chromosome 7803 8339 537 - PseudoCAP gene NA 0
## gene_id Name Dbxref Alias name Parent locus
## gene1650835 gene1650835 GeneID:4384099 PA14_00010 dnaA <NA>
## gene1650837 gene1650837 GeneID:4384100 PA14_00020 dnaN <NA>
## gene1650839 gene1650839 GeneID:4384101 PA14_00030 recF <NA>
## gene1650841 gene1650841 GeneID:4384102 PA14_00050 gyrB <NA>
## gene1650843 gene1650843 GeneID:4385186 PA14_00060 PA14_00060 <NA>
## gene1650845 gene1650845 GeneID:4385187 PA14_00070 PA14_00070 <NA>
## log2FoldChange
## gene1650835 -0.4694423
## gene1650837 0.3530781
## gene1650839 -0.8730243
## gene1650841 -0.8078822
## gene1650843 -0.3013717
## gene1650845 -0.5169827
# Step 3: Create summary for each chromosome (usually just 1 in bacteria)
chr_ranges <- gff_annot |>
dplyr::group_by(seqnames) |>
dplyr::summarise(start = min(start), end = max(end)) |>
dplyr::ungroup()
cairo_pdf(paste0(Sys.Date(),"_circos_", exp, "_", treatment,".vs.", control,".pdf", sep = ""))
circos.clear()
circos.par(cell.padding = c(0, 0, 0, 0))
# Normalize log2FC to range [-1, 1]
max_fc <- max(abs(gff_annot$log2FoldChange), na.rm = TRUE)
gff_annot$norm_fc <- gff_annot$log2FoldChange / max_fc
# Assign color: red (up), blue (down), gray (not significant)
gff_annot$col <- "gray"
gff_annot$col[gff_annot$log2FoldChange > 2 & res$padj < 0.01] <- rgb(1, 0, 0, alpha = 0.7) # red
gff_annot$col[gff_annot$log2FoldChange < -2 & res$padj < 0.01] <- rgb(0, 0, 1, alpha = 0.7) # blue
# Initialize circos plot
circos.initialize(factors = chr_ranges$seqnames, xlim = chr_ranges[, c("start", "end")])
# Track plotting
circos.trackPlotRegion(
ylim = c(-1, 1), # allow bars to go below and above axis
track.height = 0.2,
panel.fun = function(region, value, ...) {
sector <- get.cell.meta.data("sector.index")
gene_data <- gff_annot[gff_annot$seqnames == sector, ]
circos.rect(
xleft = gene_data$start,
ybottom = 0,
xright = gene_data$end,
ytop = gene_data$norm_fc,
col = gene_data$col,
border = NA
)
},
bg.border = NA,
bg.col = "white"
)
# Define gene list
alg_genes <- c(
"algA", "algF", "algJ", "algI", "algL", "algX", "algG", "algE",
"algK", "alg44", "alg8", "algD", "mucC", "mucB", "algU", "algW",
"algP", "algQ", "algR", "algZ", "algC", "algB", "dnaA"
)
alg_genes_found <- gff_annot[gff_annot$name %in% alg_genes | gff_annot$Alias %in% alg_genes, ]
gff_annot$highlight <- ifelse(gff_annot$name %in% alg_genes | gff_annot$Alias %in% alg_genes, TRUE, FALSE)
circos.trackPlotRegion(
ylim = c(0, 1),
track.height = 0.05,
panel.fun = function(...) {
sector <- get.cell.meta.data("sector.index")
gene_data <- gff_annot[gff_annot$seqnames == sector & gff_annot$highlight == TRUE, ]
if(nrow(gene_data) == 0) return()
# Draw rectangles or points at gene start position
circos.rect(
xleft = gene_data$start,
ybottom = 0,
xright = gene_data$end,
ytop = 1,
col = "gold", # highlight color
border = "black"
)
# Optional: add gene labels
circos.text(
x = (gene_data$start + gene_data$end) / 2,
y = 1.05,
labels = gene_data$name,
facing = "clockwise",
niceFacing = TRUE,
cex = 0.5,
adj = c(0, 0.5)
)
},
bg.border = NA
)## Note: 23 points are out of plotting region in sector 'chromosome',
## track '2'.
## Note: 3 points are out of plotting region in sector 'chromosome', track
## '2'.
## Note: 20 points are out of plotting region in sector 'chromosome',
## track '2'.
# Draw an empty track (no data) for the tick marks and labels
circos.trackPlotRegion(
ylim = c(0, 1),
track.height = 0.05,
bg.border = NA,
panel.fun = function(x, y) {
sector <- get.cell.meta.data("sector.index")
xlim <- get.cell.meta.data("xlim")
ylim <- get.cell.meta.data("ylim")
circos.center <- get.cell.meta.data("cell.start.degree") # for possible angle reference
# Create tick positions every 1Mb within sector range
ticks <- seq(from = ceiling(xlim[1] / 1e6) * 1e6, to = xlim[2], by = 1e6)
# Draw vertical ticks (lines) on circle at each tick position
circos.segments(
x0 = ticks, y0 = 0,
x1 = ticks, y1 = 0.2,
sector.index = sector,
col = "black",
lwd = 1
)
# Add labels a little outside the ticks, showing "X Mb"
label_pos <- ticks + 5e4 # slightly offset to the right, adjust if needed
# Convert label position to Mb and add "Mb"
labels <- paste0(ticks / 1e6, " Mb")
circos.text(
x = ticks,
y = 0.25,
labels = labels,
sector.index = sector,
facing = "clockwise",
niceFacing = TRUE,
adj = c(0, 0.5),
cex = 0.6
)
}
)
# Example: define significant DEGs (adjust your threshold as needed)
gff_annot$padj <- res$padj
sig_degs <- gff_annot[!is.na(gff_annot$padj) & gff_annot$padj < 0.01, ]
num_up <- sum(sig_degs$log2FoldChange > 2)
num_down <- sum(sig_degs$log2FoldChange < -2)
library(grid)
# After all circos plotting
grid.text(
label = paste0("Up: ", num_up, "\nDown: ", num_down),
x = 0.5, y = 0.5, # Center of plotting area
gp = gpar(fontsize = 14, col = "black")
)
dev.off()## quartz_off_screen
## 2