library(devtools)
devtools::load_all()
This demo was conducted with an early version of sCubed and metadata that were in an intermediate stage of transformation. The upcoming package, R/SquareCube, was originally called sCubed as reflected in code found here.
Define a default theme to maintain consistency across plots generated
using ggplot2
.
applyDefaultTheme <- ggplot2::theme(
axis.title = ggplot2::element_text(size = 12, face = "bold"),
axis.text = ggplot2::element_text(size = 12, face = "plain"),
legend.title = ggplot2::element_text(size = 12, face = "bold"),
legend.text = ggplot2::element_text(size = 12, face = "plain")
)
Define color schemes for processes and key physical material.
kColorsProcessTypes <-
c(
"aliquoting" = "#E6C4B3",
"bead-based homogenization" = "#F2A7A0",
"autoclave" = "#CC8866",
"centrifugation" = "#CC5012",
"contamination measurement" = "#6B522E",
"counting" = "#DCE1DD",
"establishing culture growth environment" = "#BABCB2",
"fermentation" = "#D9B482",
"Freeze-Drying" = "#A78250",
"harvest" = "#542003",
"microbial culture procedure" = "#F1E3D0",
"open" = "#FF6657",
"optical density measurement" = "#88DDDD",
"passage" = "#0BB2AE",
"resuspension" = "#000000",
"seed" = "#FF9933",
"snap freeze" = "#E87B2F",
"storage" = "#e4dbce",
"spot bleach" = "#9D1A0E",
"weight measurement" = "#bcd8cd",
"storage" = "#533640",
"substance combination" = "#333333",
"transfer" = "#eee7dc",
"volume measurement" = "#B33F4B",
"wash" = "#ea5d36",
"weight measurement" = "#794E68"
)
# Colors were established to closely match the colors in https://doi.org/10.1021/acs.analchem.1c01294
kColorsBacteriaProductionBatches <-
c(
"bac_1_organism_batch" = "#8D0402",
"bac_2_organism_batch" = "#FE8C41",
"bac_3_organism_batch" = "#E1A437",
"bac_4_organism_batch" = "#D7E08E",
"bac_5_organism_batch" = "#67BBA9",
"bac_6_organism_batch" = "#7FA64F",
"bac_7_organism_batch" = "#5ECD89",
"bac_8_organism_batch" = "#33D0F1",
"bac_9_organism_batch" = "#0081C0",
"bac_10_organism_batch" = "#010101",
"bac_11_organism_batch" = "#6F3788",
"bac_12_organism_batch" = "#C03090",
"bac_13_organism_batch" = "#EC8BAC",
"bac_14_organism_batch" = "#FFFABD",
"bac_15_organism_batch" = "#F0F0F1",
"bac_16_organism_batch" = "#AEDBA1",
"bac_17_organism_batch" = "#00C29A",
"bac_18_organism_batch" = "#62B1BF",
"bac_19_organism_batch" = "#4558E8",
"bac_20_organism_batch" = "#4C4259"
)
# This color scheme comes from scales::show_col(ggsci::pal_ucscgb("default")(21))
kColorsPooledBacteria <-
c(
"avb_31" = "#FF0000FF",
"avb_32" = "#FF9900FF",
"avb_39" = "#FFCC00FF",
"avb_35" = "#00FF00FF",
"avb_44" = "#6699FFFF",
"avb_40" = "#CC33FFFF",
"avb_25" = "#99991EFF",
"avb_24" = "#999999FF",
"avb_3" = "#FF00CCFF",
"avb_7" = "#CC0000FF",
"avb_6" = "#FFCCCCFF",
"avb_45" = "#FFFF00FF",
"avb_8" = "#CCFF00FF",
"avb_15" = "#358000FF",
"avb_14" = "#0000CCFF",
"avb_13" = "#99CCFFFF",
"avb_41" = "#00FFFFFF",
"avb_17" = "#CCFFFFFF",
"avb_92" = "#9900CCFF",
"avb_93" = "#CC99FFFF",
"avb_26" = "#996600FF"
)
GetGreyColorRamp <- grDevices::colorRampPalette(c("#f6f6f6", "#22262b"))
kGreyScale <- GetGreyColorRamp(24)
R/SquareCube includes a function,
TransformProcessExecutionSheet
, to prepare metadata about
investigative activities captured in the processExecution
Sheet of sCubed.
The function TransformProcessExecutionSheet
# define the url for the spreadsheet that contains executed processes
processExecutionSheetUrl <-
"https://docs.google.com/spreadsheets/d/1rZoh2eEopqE4rrAGBUyWupEi9Y7tUYdK-BWzMNVf0xs/edit#gid=471072976"
# transform the executed processes so that they're prepped for querying provenance
transformedProcessExecutionSheet <- sCubed::TransformProcessExecutionSheet(processExecutionSheetUrl)
Fetch the planned processes.
# define the url for the spreadsheet that contains planned processes
processDefinitionSheetUrl <- "https://docs.google.com/spreadsheets/d/1rZoh2eEopqE4rrAGBUyWupEi9Y7tUYdK-BWzMNVf0xs/edit#gid=699312441"
# fetch the data
processDefinitionSpreadsheet <- gsheet::gsheet2text(processDefinitionSheetUrl, format = "csv")
processDefinitionSpreadsheet <- read.csv(text = processDefinitionSpreadsheet, stringsAsFactors = FALSE)
Fetch the workflow management entities and expand the mini tables.
workflowManagementSheetUrl <- "https://docs.google.com/spreadsheets/d/1rZoh2eEopqE4rrAGBUyWupEi9Y7tUYdK-BWzMNVf0xs/edit#gid=906212126"
workflowManagementSheet <- gsheet::gsheet2text(workflowManagementSheetUrl, format = "csv")
workflowManagementSheet <- read.csv(text = workflowManagementSheet)
workflowManagementSheet <- sCubed::ExpandMiniTableWorkflowManagement(workflowManagementSheet)
R/SquareCube enables scientists to overview entire investigations and invdividual methods. Here, we’ll overview an investigation conducted by the CIDC-UGA and look at an individual method called the Iterative Batch Averaging Technique (Gouveia et al., 2021).
To follow along with this demo, it’s helpful to know a few key things about the investigation conducted by the CIDC-UGA:
# Subset to only the data we need for the tile plot
reportingWorkflowDataTile <- workflowManagementSheet[
c("workflow_entity_type", "workflow_name", "workflow_sequence_number", "process_type")
]
reportingWorkflowDataTile <- reportingWorkflowDataTile[
reportingWorkflowDataTile$workflow_entity_type == "reporting workflow",
]
# Setup limits for y-axis (names of reporting workflows)
reportingWorkflowDataTile$ymin <- 0
reportingWorkflowDataTile$ymax <- 0.25
kStepNumberAxisBreaksReportingWorkflowTile <- seq(
1, max(reportingWorkflowDataTile$workflow_sequence_number), by = 1
)
plotReportingWorkflowTile <- ggplot2::ggplot(reportingWorkflowDataTile,
ggplot2::aes(
xmin = workflow_sequence_number + 0.5,
xmax = workflow_sequence_number - 0.5,
ymin = ymin,
ymax = ymax,
fill = process_type
)
) +
ggplot2::geom_rect(colour = "black", size = 0.5) +
ggplot2::theme_minimal() +
ggplot2::facet_wrap(~workflow_name, ncol = 1) +
ggplot2::theme(
legend.position = "top",
legend.title = ggplot2::element_text(face = "bold"),
legend.text = ggplot2::element_text(size = 8),
legend.background = ggplot2::element_rect(colour = "#fafafa", fill = "#fafafa"),
axis.title = ggplot2::element_text(face = "bold"),
axis.ticks.y = ggplot2::element_blank(),
axis.text.y = ggplot2::element_blank(),
legend.justification = "top",
axis.line.x = ggplot2::element_line(color = "black"),
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
strip.text = ggplot2::element_text(hjust = 0.03)
) +
ggplot2::scale_fill_manual(values = kColorsProcessTypes) +
ggplot2::xlab("Step number") +
ggplot2::ylab("Reporting workflow") +
ggplot2::scale_x_continuous(breaks = kStepNumberAxisBreaksReportingWorkflowTile) +
ggplot2::guides(ggplot2::guides(
fill = ggplot2::guide_legend(
title = "Type of process",
title.position = "top",
nrow = 6
)
))
plotReportingWorkflowTile
Figure 1. Reporting workflows were defined by the investigators to reflect which steps in the investigation should be reported.
This demo is not intended to include sufficient info to fully understand the investigation conducted by the CIDC-UGA. Instead, this demo concentrates on only a few components of the CIDC-UGA investigation to quickly demo R/SquareCube. To learn more about the investigation conducted by the CIDC-UGA, check out the following publications:
Let’s look at the metadata to better understand a few key components the CIDC-UGA investigation.
Over the course of three years, the CIDC-UGA conducted a variety of activities, in vitro, to investigate the metabolome of worms.
# Subset to only the data we need for the stream plot
executedProcessesDataStream <- transformedProcessExecutionSheet[
!duplicated(transformedProcessExecutionSheet[c(
"workflow_instance_name",
"workflow_instance_step_number",
"repitition_number"
)]), ]
# Exclude specific process types
processTypesToExcludeFromStream <- c("weight measurement", "volume measurement")
executedProcessesDataStream <- executedProcessesDataStream[
!(executedProcessesDataStream$process_type %in% processTypesToExcludeFromStream), ]
# Filter by start date
executedProcessesDataStream <- executedProcessesDataStream[
executedProcessesDataStream[["start_date"]] >= "2018-01-01", ]
# Count the number of times process types were executed per day
countsProcessTypeExecutedPerDay <- dplyr::count(
executedProcessesDataStream, process_type, start_date, name = "count")
countsProcessTypeExecutedPerDay <- na.omit(countsProcessTypeExecutedPerDay)
countsProcessTypeExecutedPerDay$start_date <- as.Date(countsProcessTypeExecutedPerDay$start_date)
# Create the stream plot
plotStreamProcessTypeExecutedPerDay <- ggplot2::ggplot(countsProcessTypeExecutedPerDay,
ggplot2::aes(x = start_date, y = count, fill = process_type)) +
ggstream::geom_stream() +
ggplot2::scale_fill_manual(values = kColorsProcessTypes) +
ggplot2::scale_x_date(
date_labels = "%Y-%m",
date_breaks = "4 months",
expand = c(.1, .1)) +
ggplot2::theme_classic() +
ggplot2::theme(
legend.position = "top",
legend.title = ggplot2::element_text(face = "bold"),
legend.text = ggplot2::element_text(size = 8),
legend.background = ggplot2::element_rect(colour = "#fafafa", fill = "#fafafa"),
rect = ggplot2::element_rect(fill = "transparent", colour = "transparent", linewidth = 0)) +
ggplot2::xlab("Date") +
ggplot2::ylab("Count of activities") +
ggplot2::labs(fill = "Type of process") +
ggplot2::guides(fill = ggplot2::guide_legend(nrow = 6, byrow = TRUE, title.position = "top"))
plotStreamProcessTypeExecutedPerDay
Figure 2. A stream plot reveals the volume and variety of activities executed by the CIDC-UGA, including the impact of the COVID-19 pandemic. The volume of each stream represents the number of times an activity was executed at each timepoint. The position of each stream on the y-axis is insignificant.
The CIDC-UGA developed a method called the Iterative Batch Averaging Technique (IBAT; Gouveia et al., 2021). One purpose of IBAT was to reduce variance across sources of food. To execute IBAT, the CIDC-UGA pooled aliquots of bacteria that were generated via numerous production batches of bacteria.
First, let’s look at how investigator Gouevia produced batches of bacteria.
# customize vistime function to get markers in correct order
set_y_values <- function(data, optimize_y) {
if (optimize_y) {
data <- data[order(data$subplot, data$start, as.integer(factor(data$event, levels = unique(data$event)))), ]
}
row.names(data) <- 1:nrow(data)
for (i in unique(data$subplot)) {
thisGroup <- data[data$subplot == i, ]
thisGroup$y <- 0
if (optimize_y) {
for (row in seq_len(nrow(thisGroup))) {
toAdd <- thisGroup[row, ]
for (candidate_y in rev(0:(nrow(thisGroup) + 1))) {
all_on_current_y <- thisGroup[thisGroup$y == candidate_y, ][-row, ]
conflict_seen <- FALSE
for (j in seq_len(nrow(all_on_current_y))) {
conflict_seen <- conflict_seen |
toAdd$start == all_on_current_y[j, "start"] |
toAdd$end == all_on_current_y[j, "end"] |
toAdd$start == toAdd$end & toAdd$start >= all_on_current_y[j, "start"] & toAdd$start <= all_on_current_y[j, "end"] |
all_on_current_y[j, "start"] == all_on_current_y[j, "end"] & toAdd$start <= all_on_current_y[j, "start"] & toAdd$end >= all_on_current_y[j, "end"] |
all_on_current_y[j, "start"] != all_on_current_y[j, "end"] & toAdd$start != toAdd$end &
(toAdd$start > all_on_current_y[j, "start"] & toAdd$start < all_on_current_y[j, "end"] |
toAdd$end > all_on_current_y[j, "start"] & toAdd$end < all_on_current_y[j, "end"])
}
if (!isTRUE(conflict_seen)) {
thisGroup$y[row] <- candidate_y
break
} else {
next
}
}
}
} else {
thisGroup$y <- seq_len(nrow(thisGroup))
}
data[data$subplot == i, "y"] <- max(thisGroup$y) - thisGroup$y + 1
}
data$y <- as.numeric(data$y)
data$y[is.na(data$y)] <- max(data$y[!is.na(data$y)]) + 1
adds <- cumsum(rev(unname(by(data, data$subplot, function(subplot) max(subplot$y)))))
adds <- c(0, adds[-length(adds)] + seq_len(length(adds) - 1))
y_add <- rev(rep(adds, times = rev(table(data$subplot))))
data$y <- data$y + y_add
return(data)
}
# customize vistime function to return both plot and data
custom_gg_vistime <- function(data,
col.event = "event",
col.start = "start",
col.end = "end",
col.group = "group",
col.color = "color",
col.fontcolor = "fontcolor",
optimize_y = TRUE, linewidth = NULL, title = NULL,
show_labels = TRUE, background_lines = NULL, ...) {
checked_dat <- validate_input(data, col.event, col.start, col.end, col.group, col.color,
col.fontcolor, col.tooltip = NULL, optimize_y, linewidth, title,
show_labels, background_lines, list(...)
)
cleaned_dat <- vistime_data(
checked_dat$data, checked_dat$col.event, checked_dat$col.start,
checked_dat$col.end, checked_dat$col.group, checked_dat$col.color,
checked_dat$col.fontcolor, checked_dat$col.tooltip, optimize_y
)
names(cleaned_dat)[names(cleaned_dat) == "x"] <- "x2"
names(cleaned_dat)[names(cleaned_dat) == "y"] <- "x"
names(cleaned_dat)[names(cleaned_dat) == "x2"] <- "y"
total <- plot_ggplot(cleaned_dat, linewidth, title, show_labels, background_lines)
total <- list("plot" = total, "data" = cleaned_dat)
return(total)
}
# customize vistime function to modify shape and size of markers
custom_plot_ggplot <- function(data, linewidth, title, show_labels, background_lines) {
# 1. Prepare basic plot
x_ticks <- tapply(data$x, data$subplot, mean)
gg <- ggplot2::ggplot(data, ggplot2::aes(x = .data$x, y = .data$start, xend = .data$x, yend = .data$end, color = I(.data$col))) +
ggplot2::ggtitle(title) +
ggplot2::labs(x = NULL, y = NULL) +
ggplot2::scale_x_continuous(breaks = x_ticks, labels = unique(data$group)) + #
ggplot2::theme_classic() +
ggplot2::theme(
plot.title = ggplot2::element_text(hjust = 0.5),
axis.ticks.x = ggplot2::element_blank(),
axis.text.x = ggplot2::element_text(hjust = 1),
# line = element_blank(),
panel.background = ggplot2::element_rect(colour = "black", linetype = "solid")
) +
ggplot2::coord_cartesian(xlim = c(0.5, max(data$x) + 0.5))
# 2. Add background vertical lines
if (is.null(background_lines)) {
gg <- gg + ggplot2::theme(
panel.grid.major.y = ggplot2::element_line(colour = "grey90"),
panel.grid.minor.y = ggplot2::element_line(colour = "grey90")
)
} else {
gg <- gg + ggplot2::geom_hline(yintercept = seq(min(c(data$start, data$end)), max(c(data$start, data$end)),
length.out = round(background_lines) + 1
), colour = "grey90")
}
# 3. Divide subplots with horizontal lines
gg <- gg + ggplot2::geom_vline(
xintercept = c(setdiff(seq_len(max(data$x)), data$x)),
colour = "grey65"
)
# Plot ranges and events
lw <- ifelse(is.null(linewidth), min(30, 100 / max(data$x)), linewidth) # MODIFIED min(30 to min(10
range_dat <- data[data$start != data$end, ]
event_dat <- data[data$start == data$end, ]
# event_dat <- rev(event_dat)
# range_dat <- rev(range_dat)
# data <- rev(data)
gg <- gg +
ggplot2::geom_segment(data = range_dat, size = lw) +
ggplot2::geom_point(
data = event_dat, mapping = ggplot2::aes(fill = I(.data$col)),
shape = 22, size = 0.7 * lw, colour = "black", stroke = 0.1
) # MODIFIED shape = 21 to 22
# Labels for Ranges in center of range
ranges <- data[data$start != data$end, ]
ranges$start <- ranges$start + (ranges$end - ranges$start) / 2
if (show_labels) {
gg <- gg +
ggplot2::geom_text(mapping = ggplot2::aes(colour = I(.data$fontcol), label = .data$label), data = ranges) +
ggplot2::geom_text_repel(
mapping = ggplot2::aes(colour = I(.data$fontcol), label = .data$label),
data = event_dat, direction = "y", segment.alpha = 0,
point.padding = grid::unit(0.75, "lines")
)
# , nudge_y = rep_len(c(0.3,-0.3), nrow(event_dat)))
}
return(gg)
}
# change vistime environ to reflect custom functions
environment(custom_gg_vistime) <- asNamespace("vistime")
assignInNamespace("gg_vistime", custom_gg_vistime, ns = "vistime")
assignInNamespace("plot_ggplot", custom_plot_ggplot, ns = "vistime")
assignInNamespace("set_y_values", set_y_values, ns = "vistime")
# subset to the data we need for timeline plot
executedProcessesForBacteriaBioreactor <- transformedProcessExecutionSheet[
transformedProcessExecutionSheet["reporting_workflow_name"] == "Goncalo's standard e. coli bioreactor", ]
executedProcessesForBacteriaBioreactor <- executedProcessesForBacteriaBioreactor[
executedProcessesForBacteriaBioreactor["start_date"] != "", ]
executedProcessesForBacteriaBioreactor <- executedProcessesForBacteriaBioreactor[
executedProcessesForBacteriaBioreactor["process_type"] != "", ]
executedProcessesForBacteriaBioreactor <- executedProcessesForBacteriaBioreactor[
!is.na(executedProcessesForBacteriaBioreactor$start_date), ]
executedProcessesForBacteriaBioreactor <- executedProcessesForBacteriaBioreactor[
!is.na(executedProcessesForBacteriaBioreactor$process_type), ]
executedProcessesForBacteriaBioreactor <- subset(executedProcessesForBacteriaBioreactor,
start_date >= as.Date("2018-01-01") &
start_date <= as.Date("2019-11-01"))
# prep data as expected by vistime pkg
executedProcessesForBacteriaBioreactor <- merge(executedProcessesForBacteriaBioreactor,
data.frame(process_type = names(kColorsProcessTypes),
color = kColorsProcessTypes,
row.names = NULL),
by = "process_type")
names(executedProcessesForBacteriaBioreactor)[names(executedProcessesForBacteriaBioreactor) == "start_date"] <- "start"
# order by date so we can read first-to-last by moving left-to-right
order_date <- unique(executedProcessesForBacteriaBioreactor$workflow_instance_name[
order(executedProcessesForBacteriaBioreactor$start)])
executedProcessesForBacteriaBioreactor <- executedProcessesForBacteriaBioreactor %>%
arrange(factor(workflow_instance_name, levels = rev(order_date)))
generateDataForIbatTimeline <- vistime::gg_vistime(
data = data.frame(executedProcessesForBacteriaBioreactor),
col.group = "workflow_instance_name",
col.event = "process_type",
linewidth = 20,
col.color = "color",
show_labels = FALSE,
optimize_y = TRUE
)
# setup rectangles that appear at top of timeline and represent batches of bacteria
# TODO Make setting xmin, ymin, and colors dynamic.
kXMinRectBacteriaBatchTimeline <- rev(c(
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_16", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_15", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_13", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_14", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_12", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_11", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_9", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_10", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_8", ]$x),
min(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_7", ]$x)
))
kXMaxRectBacteriaBatchTimeline <- rev(c(
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_16", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_15", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_13", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_14", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_12", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_11", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_9", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_10", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_8", ]$x),
max(generateDataForIbatTimeline$data[generateDataForIbatTimeline$data$group == "bac_7", ]$x)
))
kFillRectBacteriaBatchTimeline <- rev(c(
"#AEDBA1", "#F0F0F1", "#EC8BAC", "#FFFABD", "#C03090", "#6F3788", "#0081C0", "#010101",
"#33D0F1", "#5ECD89"
))
numberOfDaysAcrossExecutedProcesses <- abs(as.numeric(difftime(min(generateDataForIbatTimeline$data$start),
max(generateDataForIbatTimeline$data$end), units = c("days"))))
plotBasedIbatTimeline <- generateDataForIbatTimeline$plot +
ggplot2::theme_classic() +
ggplot2::theme(axis.title = ggplot2::element_text(face = "bold"),
axis.ticks.y = ggplot2::element_blank(),
axis.text.y = ggplot2::element_text(colour = c(rep("black",
numberOfDaysAcrossExecutedProcesses + 2),
rep("white",
3)))) +
ggplot2::scale_y_datetime(date_breaks = "1 day") +
ggplot2::ylab("Start date of activity") +
ggplot2::xlab("Instance of reporting workflow") +
ggplot2::annotate("rect",
xmin = kXMinRectBacteriaBatchTimeline,
xmax = kXMaxRectBacteriaBatchTimeline,
ymin = as.POSIXct("2019-08-09"),
ymax = as.POSIXct("2019-08-10"),
fill = kFillRectBacteriaBatchTimeline
)
# adjust size of markers
plotBasedIbatTimeline$layers[[3]]$aes_params$size <- 2.5
# make legend process types
plotLegendProcessTypes <- ggplot2::ggplot(data = executedProcessesForBacteriaBioreactor,
ggplot2::aes(workflow_instance_name, start, fill = process_type)) +
ggplot2::geom_col() +
ggplot2::scale_fill_manual(values = kColorsProcessTypes) +
ggplot2::theme(legend.position = "top",
legend.box.just = "left",
legend.title = ggplot2::element_text(face = "bold"),
legend.text = ggplot2::element_text(size = 8),
legend.background = ggplot2::element_rect(colour = "#fafafa",
fill = "#fafafa")) +
ggplot2::guides(fill = ggplot2::guide_legend(title = "Type of process",
title.position = "top",
face = "bold",
nrow = 3))
getLegendProcessTypes <- cowplot::get_legend(plotLegendProcessTypes)
# make legend production batches of bacteria
PlotLegendProductionBactchBacteria <- ggplot2::ggplot(data = executedProcessesForBacteriaBioreactor,
ggplot2::aes(workflow_instance_name,
start,
fill = output_organismal_entity_lot_number)) +
ggplot2::geom_col() +
ggplot2::scale_fill_manual(values = kColorsBacteriaProductionBatches) +
ggplot2::theme(
legend.position = "top",
legend.box.just = "left",
legend.title = ggplot2::element_text(face = "bold"),
legend.text = ggplot2::element_text(size = 8),
legend.background = ggplot2::element_rect(colour = "#fafafa", fill = "#fafafa")) +
ggplot2::guides(fill = ggplot2::guide_legend(title = "Production batch of bacteria",
title.position = "top",
face = "bold",
nrow = 3))
getLegendProductionBactchBacteria <- cowplot::get_legend(PlotLegendProductionBactchBacteria)
# plot timeline with legends
ggpubr::ggarrange(getLegendProductionBactchBacteria,
getLegendProcessTypes,
plotBasedIbatTimeline,
nrow = 3,
ncol = 1,
heights = c(.2, .2, .60))
Figure 3. Over approximately one month, Gouveia grew bacteria in 10 batches (colored bars at top). Gouveia reported activities per the reporting workflow Goncalo’s standard e. coli bioreactor.
Next, we can inspect the sources of food to learn which production batches of bacteria are present in each source of food.
In order to explore the sources of food and biological outcomes, we
must fetch the provenances for the samples of worms that we’re
interested in. R/SquareCube includes a function
GetProvenance
that fetches the ancestors of organismal
entities as well as the reported experimental activities for the sample
and its ancestors.
The
GetProvenance
function allows us to quickly fetch a history of experimentation as compared to manually piecing together provenance.
# Define our samples of interest
samplesOfInterest <- c("aos129_ga_ms2", "aos140_ga_ms3", "aos140_gt_ms3",
"aos149_ga_ms2", "aos167_ga_ms3", "aos167_ga_ms4",
"aos180_ga_ms2", "aos181_ga_ms2", "aos185_ga_ms2",
"aos188_ga_ms3", "aos27_ga_ms3", "aos27_ga_ms4",
"aos38_ga_ms2", "aos41_ga_ms2", "aos43_ga_bku",
"aos43_ga_ms4", "aos44_ga_ms2", "aos53_ga_ms2",
"aos54_ga_bku", "aos54_ga_bku2", "aos54_ga_ms4",
"aos69_ga_ms1", "aos69_ga_ms3", "aos75_ga_ms4",
"aos78_ga_ms4", "aos87_co_ms3", "aos87_ga_ms3",
"aos90_ga_ms3", "aos92_ga_ms2")
iter <- 0
for (oneSample in samplesOfInterest) {
provenanceOfInterest_temp <- sCubed::GetProvenance(transformedProcessExecutionSheet, oneSample)
provenanceOfInterest_temp$sample_id <- oneSample
provenanceOfInterest_temp$overall_step_number <- 1:nrow(provenanceOfInterest_temp)
if (iter != 0) {
provenanceOfInterest <- rbind(provenanceOfInterest, provenanceOfInterest_temp)
} else {
provenanceOfInterest <- provenanceOfInterest_temp
}
iter <- iter + 1
#save(provenanceOfInterest, file="history_validation.Rda")
#print(paste('finish iter', iter))
}
# Get executed processes for aliquoting
aliquotingActivitiesOfInterest <- provenanceOfInterest[provenanceOfInterest$parent == "aliquotting",]
# Subset aliquoting activities for bacteria bioreactor
aliquotingActivitiesOfInterest <- aliquotingActivitiesOfInterest[aliquotingActivitiesOfInterest$reporting_workflow_name == "Goncalo's standard e. coli bioreactor",]
# Get executed processes for sample pooling
samplePoolingActivitiesOfInterest <- provenanceOfInterest[provenanceOfInterest$parent == "sample pooling",]
# Filter to only the aliquots that we previously fetched
samplePoolingActivitiesOfInterest <- samplePoolingActivitiesOfInterest[samplePoolingActivitiesOfInterest$parent_name %in%
aliquotingActivitiesOfInterest$name,]
# Organize required data into one dataframe
ibatPoolingData <- data.frame(sample_id = samplesOfInterest)
ibatPoolingData <- merge(ibatPoolingData, samplePoolingActivitiesOfInterest[, c("sample_id","name")], by="sample_id")
names(ibatPoolingData)[names(ibatPoolingData) == "name"] <- "pooled_id"
ibatPoolingData <- merge(ibatPoolingData, aliquotingActivitiesOfInterest[, c("sample_id","parent_name")], by="sample_id", all.x=TRUE)
names(ibatPoolingData)[names(ibatPoolingData) == "parent_name"] <- "bac_bior_id"
# Fill in missing combos
ibatPoolingData <- ibatPoolingData[, names(ibatPoolingData) != "sample_id"]
ibatPoolingData <- unique(ibatPoolingData)
ibatPoolingData$alpha <- 1
ibatPoolingData <- complete(ibatPoolingData, bac_bior_id, pooled_id, fill = list(alpha = 0.2))
ibatPoolingData$labelPosition = -0.6
# Plot pooled bacteria composition
plotPooledBacteriaComposition <- ggplot2::ggplot(ibatPoolingData) +
ggplot2::geom_point(
ggplot2::aes(
x = bac_bior_id, y = pooled_id, color = bac_bior_id, alpha = alpha,
size = 2
),
) +
ggplot2::theme_classic() +
ggplot2::theme(
legend.position = "none",
axis.ticks = ggplot2::element_blank(),
legend.title = ggplot2::element_text(face = "bold"),
axis.title = ggplot2::element_text(face = "bold"),
axis.text.x = ggplot2::element_blank(),
axis.text.y = ggplot2::element_blank(),
axis.ticks.y = ggplot2::element_blank(),
axis.line = ggplot2::element_blank(),
plot.margin = ggplot2::unit(c(0,-10,0,1), 'lines')
) +
ggplot2::coord_fixed(ratio = 1) +
ggplot2::scale_color_manual(values = kColorsBacteriaProductionBatches) +
ggplot2::geom_label(
data = ibatPoolingData,
ggplot2::aes(y = pooled_id, x = labelPosition, fill = pooled_id, label = pooled_id), color = "black", size = 2.5,
label.size = 0, label.padding = ggplot2::unit(3, "pt"), label.r = ggplot2::unit(0, "pt")
) +
ggplot2::scale_fill_manual(values = kColorsPooledBacteria) +
ggplot2::xlab("Production batch of bacteria") +
ggplot2::ylab("Source of food (pooled bacteria)") +
ggplot2::scale_x_discrete(expand = c(.4, 0)) + # adjust these to move labels
ggplot2::geom_segment(ggplot2::aes(x = .6, xend = 12, y = .4, yend = .4)) +
ggplot2::geom_vline(xintercept = .6, colour = "black")
# Adjust size of markers
#plotPooledBacteriaComposition$layers[[3]]$aes_params$size <- 2.5
# Make legend for component of food source
plotLegendComponentOfFoodSource <- ggplot2::ggplot(ibatPoolingData) +
ggplot2::geom_point(
ggplot2::aes(
x = bac_bior_id, y = pooled_id, color = bac_bior_id, alpha = alpha
), size = 2
) +
ggplot2::scale_alpha_continuous(limits = c(0,1), breaks = c(0.2,1), labels=c("TRUE","FALSE"))+
ggplot2::scale_color_manual(values = kColorsBacteriaProductionBatches) +
ggplot2::theme(
legend.title = ggplot2::element_text(face = "bold"),
legend.background = ggplot2::element_rect(colour = "#fafafa", fill = "#fafafa"),
plot.margin = ggplot2::unit(c(0,1,0,1), 'lines')
) +
ggplot2::guides(color = ggplot2::guide_legend(
title = "Component of food\nsource", title.position = "top",
ncol = 1
)) +
ggplot2::guides(alpha = ggplot2::guide_legend(title = "Component present\nin sourceof food",
title.position = "top",
face = "bold"))
# Get legend for component of food source
getLegendComponentOfFoodSource <- cowplot::get_legend(plotLegendComponentOfFoodSource)
# Arrange the plots and legend
ggpubr::ggarrange(plotPooledBacteriaComposition, getLegendComponentOfFoodSource)
Figure 4. Gouveia pooled individual aliquots (markers) into a single source of food (one row).
# Fetch the ibat study from workbench
summarizedExperimentIbat <- metabolomicsWorkbenchR::do_query(
context = "study",
input_item = "study_id",
input_value = "ST001726",
output_item = "SummarizedExperiment"
)
# Get the metabolite info
assayIbat <- SummarizedExperiment::assay(summarizedExperimentIbat)
rownames <- rownames(assayIbat) # Get metabolite ids
peakHeightsMetaboliteA <- assayIbat[1, ] # First metabolite
peakHeightsMetaboliteB <- assayIbat[2, ]
peakHeightsMetaboliteA <- do.call(rbind.data.frame, peakHeightsMetaboliteA)
peakHeightsMetaboliteA$sample_id <- colnames(summarizedExperimentIbat)
colnames(peakHeightsMetaboliteA)[1] <- "peak_height"
peakHeightsMetaboliteA$metabolite_name <- SummarizedExperiment::rowData(summarizedExperimentIbat)$metabolite_name[1]
peakHeightsMetaboliteB <- do.call(rbind.data.frame, peakHeightsMetaboliteB)
peakHeightsMetaboliteB$sample_id <- colnames(summarizedExperimentIbat)
colnames(peakHeightsMetaboliteB)[1] <- "peak_height"
peakHeightsMetaboliteB$metabolite_name <- SummarizedExperiment::rowData(summarizedExperimentIbat)$metabolite_name[2]
peakHeightsMetabolitesOfInterest <- rbind(peakHeightsMetaboliteA, peakHeightsMetaboliteB)
peakHeightsMetabolitesOfInterest <- peakHeightsMetabolitesOfInterest[
peakHeightsMetabolitesOfInterest$sample_id %in% c(
"aos107_ga_bku_NMR_D2O_80", "aos113_ga_ms1_NMR_D2O_124", "aos113_ga_ms1_NMR_D2O_28",
"aos113_ga_ms1_NMR_D2O_84", "aos129_ga_ms2_NMR_D2O_6", "aos140_ga_ms3_NMR_D2O_87",
"aos140_gt_ms3_NMR_D2O_130", "aos149_ga_ms2_NMR_D2O_9", "aos167_ga_ms3_NMR_D2O_31",
"aos167_ga_ms4_NMR_D2O_132", "aos180_ga_ms2_NMR_D2O_89", "aos181_ga_ms2_NMR_D2O_99",
"aos185_ga_ms2_NMR_D2O_51", "aos188_ga_ms3_NMR_D2O_36", "aos196_ga_ms2_NMR_D2O_101",
"aos196_ga_ms3_NMR_D2O_135", "aos198_ga_ms2_NMR_D2O_137", "aos200_ga_ms2_NMR_D2O_39",
"aos27_ga_ms3_NMR_D2O_11", "aos27_ga_ms4_NMR_D2O_52", "aos38_ga_ms2_NMR_D2O_57",
"aos41_ga_ms2_NMR_D2O_104", "aos43_ga_bku_NMR_D2O_60", "aos43_ga_ms4_NMR_D2O_13",
"aos44_ga_ms2_NMR_D2O_106", "aos53_ga_ms2_NMR_D2O_40", "aos54_ga_bku2_NMR_D2O_61",
"aos54_ga_bku_NMR_D2O_17", "aos54_ga_ms4_NMR_D2O_111", "aos69_ga_ms1_NMR_D2O_64",
"aos69_ga_ms3_NMR_D2O_114", "aos75_ga_ms4_NMR_D2O_115", "aos78_ga_ms4_NMR_D2O_22",
"aos87_co_ms3_NMR_D2O_72", "aos87_ga_ms3_NMR_D2O_118", "aos90_ga_ms3_NMR_D2O_66",
"aos92_ga_ms2_NMR_D2O_77", "aos99_ga_ms2_NMR_D2O_141", "aos99_ga_ms2_NMR_D2O_26",
"aos99_ga_ms2_NMR_D2O_68"
),
]
# get analytical sample id as available in original data
peakHeightsMetabolitesOfInterest$alias <- stringr::str_extract(
peakHeightsMetabolitesOfInterest$sample_id, "[^_]*_[^_]*_[^_]*"
)
Metadata in sCubed extend beyond critical study factors to more fully describe the complexity of physical material. Thus, sCubed enables scientists to employ a wide range of metadata to explore biological outcomes.
Here, we’ll look at biological outcomes for samples of worms that were analyzed via Nuclear Magnetic Resonance spectroscopy (NMR). Specifically, we’ll look at the peak heights of two metabolites. (These data were generated by Gouveia et al. (2021) and are publicly available in Metabolomics Workbench under study ID ST001726.)
Below we explore both the source of food and the minimum population
size (min. OD) for any component of each food source, all with respect
to two metabolites. We’ll label one specific sample,
aos129_ga_ms2_NMR_D2O_6
, to explore later.
# Get optical density activities
opticalDensityActivitiesOfInterest <- provenanceOfInterest[
provenanceOfInterest$parent == "optical density measurement",]
# Get only the last OD measure for each production batch of bacteria
opticalDensityActivitiesOfInterest <- opticalDensityActivitiesOfInterest %>%
group_by(person_special) %>% # Production batch of bacteria
slice(which.max(as.Date(start_date, '%Y-%m-%d')))
# Get only the last OD measure for each production batch of bacteria
opticalDensityActivitiesOfInterest <- merge(
opticalDensityActivitiesOfInterest[, c("optical_density_result", "parent_name", "person_special")],
aliquotingActivitiesOfInterest[, c("sample_id", "parent_name")],
by.x = "person_special",
by.y = "parent_name")
opticalDensityActivitiesOfInterest <- opticalDensityActivitiesOfInterest %>%
group_by(sample_id) %>% # Production batch of bacteria
slice(which.min(optical_density_result))
metabolitesWithOdMeasurements <- merge(
peakHeightsMetabolitesOfInterest, opticalDensityActivitiesOfInterest,
by.x = "alias", by.y = "sample_id")
metabolitesWithFoodSource <- merge(
metabolitesWithOdMeasurements, samplePoolingActivitiesOfInterest[, c("name", "sample_id")],
by.x = "alias", by.y = "sample_id")
plotMetabolitesWithFoodSource <- ggplot2::ggplot(
unique(metabolitesWithFoodSource), ggplot2::aes(x = metabolite_name, y = peak_height, fill = optical_density_result)
) +
ggplot2::geom_boxplot() +
ggplot2::geom_point(
ggplot2::aes(color = name, group = optical_density_result, x = metabolite_name, y = peak_height),
position = ggplot2::position_jitterdodge(jitter.width = .0, dodge.width = 0.75), size = 5
) +
ggplot2::geom_text(
data = subset(unique(metabolitesWithFoodSource), unique(metabolitesWithFoodSource)$alias == "aos129_ga_ms2"),
ggplot2::aes(label = "aos129_ga_ms2_\nNMR_D2O_6", x = metabolite_name, y = peak_height)#, hjust = 0, vjust = 0
) +
ggplot2::theme_classic() +
ggplot2::theme(
axis.title = ggplot2::element_text(face = "bold"),
legend.position = "top",
legend.title = ggplot2::element_text(face = "bold"),
legend.box = "vertical",
legend.box.just = "left",
text = ggplot2::element_text(size = 12),
legend.background = ggplot2::element_rect(colour = "#fafafa", fill = "#fafafa"),
rect = ggplot2::element_rect(fill = "transparent", colour = "transparent", linewidth = 0)) +
ggplot2::scale_color_manual(values = kColorsPooledBacteria) +
ggplot2::scale_fill_manual(values = kGreyScale) +
ggplot2::xlab("Name of metabolite") +
ggplot2::ylab("Height of peak") +
ggplot2::scale_y_continuous(limits = c(0, 250000000000)) +
ggplot2::scale_alpha(limits = c(0, 1)) +
ggplot2::guides(
color = ggplot2::guide_legend(
title = "Source of food", title.position = "top",
nrow = 3, order = 1
),
fill = ggplot2::guide_legend(
title = "Min. OD for any component of food source", title.position = "top",
override.aes = list(size = .22)
)
)
plotMetabolitesWithFoodSource
Figure 5. Each sample of worms (one marker) received
one source of food. Many sources of food were used to grow worms. A
single sample, aos129_ga_ms2_NMR_D2O_6
is labeled.
Next, we can more specifically look at which components of each food source have the lowest population size.
plotMetabolitesWithMinOD <- ggplot2::ggplot(
unique(metabolitesWithOdMeasurements),
ggplot2::aes(x = metabolite_name, y = peak_height, fill = optical_density_result)
) +
ggplot2::geom_boxplot() +
ggplot2::geom_point(
ggplot2::aes(color = person_special, group = optical_density_result),
position = ggplot2::position_jitterdodge(jitter.width = .0, dodge.width = 0.75), size = 5
) +
ggplot2::geom_text(
data = subset(unique(metabolitesWithOdMeasurements), unique(metabolitesWithOdMeasurements)$alias == "aos129_ga_ms2"),
ggplot2::aes(label = "aos129_ga_ms2_\nNMR_D2O_6", x = metabolite_name, y = peak_height)#, hjust = 0, vjust = 0
) +
ggplot2::theme_classic() +
ggplot2::theme(
axis.title = ggplot2::element_text(face = "bold"),
legend.position = "top",
legend.title = ggplot2::element_text(face = "bold"),
legend.text = ggplot2::element_text(size = 8),
legend.box = "vertical",
legend.box.just = "left",
text = ggplot2::element_text(size = 12),
legend.background = ggplot2::element_rect(colour = "#fafafa", fill = "#fafafa"),
rect = ggplot2::element_rect(fill = "transparent", colour = "transparent", linewidth = 0)
) +
ggplot2::scale_color_manual(values = kColorsBacteriaProductionBatches) +
ggplot2::scale_fill_manual(values = kGreyScale) +
ggplot2::xlab("Name of metabolite") +
ggplot2::ylab("Height of peak") +
ggplot2::scale_y_continuous(limits = c(0, 250000000000)) +
ggplot2::guides(
color = ggplot2::guide_legend(
title = "Component of food source", title.position = "top",
nrow = 2, order = 1
),
fill = ggplot2::guide_legend(
title = "Min. OD for any component of food source", title.position = "top",
override.aes = list(size = .22)
)
)
plotMetabolitesWithMinOD
Figure 6. Many samples of worms (markers) shared
components in their sources of food. The production batch of bacteria
that had the lowest OD measure at the time of harvesting the bacteria is
represented as the color of markers. A single sample,
aos129_ga_ms2_NMR_D2O_6
is labeled.
scatter <- ggplot2::ggplot(unique(subntest), ggplot2::aes(x = start_date, y = as.numeric(optical_density_result), color = bac_bior_id)) + # ,aes(fill=workbench) label=alias
ggplot2::geom_jitter(ggplot2::aes(x = start_date, y = as.numeric(optical_density_result)), size = 5, alpha = 0.75, position = ggplot2::position_jitter(seed = 1)) +
ggplot2::annotate("rect",
fill = "pink", alpha = 0.25,
xmin = "2019-08-01", xmax = "2019-10-30",
ymin = -Inf, ymax = Inf
) + # alpha=workbench
# geom_text(position = position_jitter(seed = 1))+
ggplot2::geom_text(
data = subset(unique(subntest), unique(subntest)$alias == "aos129_ga_ms2"), show.legend = FALSE,
ggplot2::aes(label = "aos129_ga_ms2_NMR_D2O_6", x = start_date, y = as.numeric(optical_density_result), color = "black", hjust = 0, vjust = 0)
) +
ggplot2::ylab("Min. OD for any\ncomponent of food source") +
ggplot2::xlab("Date source of food was seeded") +
# ggsci::scale_color_ucscgb() +
ggplot2::theme_classic() +
ggplot2::theme(
axis.title = ggplot2::element_text(face = "bold"),
legend.position = "top",
legend.title = ggplot2::element_text(face = "bold"),
legend.background = ggplot2::element_rect(colour = "#fafafa", fill = "#fafafa"),
legend.text = ggplot2::element_text(size = 8),
rect = ggplot2::element_rect(fill = "transparent", colour = "transparent", linewidth = 0)
) +
ggplot2::geom_hline(
yintercept = 20, linetype = "solid",
color = "pink", size = 1
) +
# guides(color ="none") +
ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(n.dodge = 2)) +
ggplot2::scale_y_continuous(limits = c(0, 23)) + # fix as.numeric
ggplot2::scale_color_manual(values = kColorsBacteriaProductionBatches) +
ggplot2::guides(color = ggplot2::guide_legend(
title = "Component of food source", title.position = "top",
nrow = 3, order = 1
))
scatter
The primary purpose of R/SquareCube is to enable scientists to retrieve the provenances of physical materials in sCubed. Many of the visualizations that we’ve looked at rely on the ability of R/SquareCube to fetch provenance. Now, let’s take a look at what constitutes the provenance of physical material in sCubed.
To gain an overview of the provenance for one physical material, we ask,
“What physical materials, planned processes, executed processes, and investigators describe how sample ‘aos129_ga_ms2’ was produced and processed?”
Visualize provenance using a network where
Interact with the network:
# define our sample of interest
sampleName <- "aos129_ga_ms2"
# get the provenance of sample
provenance <- sCubed::GetProvenance(transformedProcessExecutionSheet, sampleName)
# prepare data for network
prepped_provenance_data_network <- prep_exec_proc_network(provenance)
# generate network
charge <- -30
height <- 650
width <- 1000
plotProvenanceNetwork <- sCubed::GenerateNetwork (prepped_provenance_data_network, charge, height, width)
# plot network
plotProvenanceNetwork
Figure 8. A network of entities gives an overview of
how one sample aos129_ga_ms2
was produced.
“Within the provenance of ‘aos129_ga_ms2’, what’s an example of an executed process?”
# filter provenance to only include the target instance of a reporting workflow
targetWorkflowInstanceId <- "bac_7,Goncalo's standard e. coli bioreactor,2019-07-23 11:59:00"
oneExecutedProcess <- filter(provenance, workflow_instance_id == targetWorkflowInstanceId)
# further filter provenance to one specific step
targetStepNumber <- 3
oneExecutedProcess <- filter(oneExecutedProcess, workflow_instance_step_number == targetStepNumber)
# prep data for network
preppedOneExecutedProcessDataNetwork <- prep_exec_proc_network(oneExecutedProcess)
# generate network
charge <- -30
height <- 400
width <- 200
plotNetworkOneExecutedProcess <- sCubed::GenerateNetwork (preppedOneExecutedProcessDataNetwork, charge, height, width)
customJS <-
'function(el) {
d3.select(el).select(".zoom-layer").attr("transform", "scale(1)")
}'
# plot network
htmlwidgets::onRender(plotNetworkOneExecutedProcess, customJS)
Figure 9. A network of entities gives an overview of one executed process.
“Within the provenance of ‘aos129_ga_ms2’, what’s an example of a planned process?”
# TODO make generating this net dynamic
# define the identifier for the target planned process
targetProcessId <- "fermentation_bacteria_standard"
# filtered all planned processes to only the targeted planned process
onePlannedProcess <- dplyr::filter(
processDefinitionSpreadsheet,
process_id == targetProcessId
)
# remove columns with na
onePlannedProcess <- onePlannedProcess[, colSums(is.na(onePlannedProcess)) < nrow(onePlannedProcess)]
# remove empty columns
onePlannedProcess <- onePlannedProcess[!sapply(onePlannedProcess, function(x) {
all(x == "")
})]
attributesOnePlannedProcess <- colnames(onePlannedProcess)
sourceOfAttributesOnePlannedProcess <- rep(
"fermentation_bacteria_standard",
length(attributesOnePlannedProcess)
)
valuesOnePlannedProcess <- unlist(c(onePlannedProcess[1, ]))
onePlannedProcessDataNetwork <- data.frame(
source = c(
sourceOfAttributesOnePlannedProcess,
attributesOnePlannedProcess
),
destination = c(attributesOnePlannedProcess, valuesOnePlannedProcess),
sourceGroup = c(
rep(
"planned process",
length(sourceOfAttributesOnePlannedProcess)
),
rep(
"attribute", length(attributesOnePlannedProcess)
)
),
destinationGroup = c(
rep(
"attribute", length(attributesOnePlannedProcess)
),
rep(
"value", length(valuesOnePlannedProcess)
)
)
)
groupData <- data.frame(
label = c(
onePlannedProcessDataNetwork$source,
onePlannedProcessDataNetwork$destination
),
group = c(
onePlannedProcessDataNetwork$sourceGroup,
onePlannedProcessDataNetwork$destinationGroup
)
)
sources <- onePlannedProcessDataNetwork %>%
dplyr::distinct(source) %>%
dplyr::rename(label = source)
destinations <- onePlannedProcessDataNetwork %>%
dplyr::distinct(destination) %>%
dplyr::rename(label = destination)
nodes <- dplyr::full_join(sources, destinations, by = "label")
nodes <- nodes %>% tibble::rowid_to_column("id")
nodesD3 <- dplyr::mutate(nodes, id = id - 1)
group <- groupData$group[match(nodesD3$label, groupData$label)]
nodesD3$group <- group
nodesD3$node_size <- rep(10, length(nodesD3$group))
per_route <- onePlannedProcessDataNetwork %>%
dplyr::group_by(source, destination) %>%
ungroup()
edges <- per_route %>%
left_join(nodes, by = c("source" = "label")) %>%
dplyr::rename(from = id)
edges <- edges %>%
dplyr::left_join(nodes, by = c("destination" = "label")) %>%
dplyr::rename(to = id)
edges <- select(edges, from, to)
edgesD3 <- dplyr::mutate(edges, from = from - 1, to = to - 1)
edgesD3$weight <- rep(2, length(edgesD3$from))
ColourScale <- 'd3.scaleOrdinal()
.domain(["workflow management","conceptual entity","material", "planned process","executed process",
"attribute","value"])
.range(["#beafe1","#c2b280","#05d0eb", "#a6e000","#56d876","#000000","#666666"]);'
networkOnePlannedProcess <- networkD3::forceNetwork(
Links = edgesD3,
Nodes = nodesD3,
Source = "from",
Target = "to",
NodeID = "label",
Group = "group",
Value = "weight",
opacity = 1,
fontSize = 16,
zoom = TRUE,
arrows = FALSE,
colourScale = htmlwidgets::JS(ColourScale),
legend = F,
fontFamily = "Arial",
opacityNoHover = 1,
charge = -900,
height = 800,
width = 800
)
# https://stackoverflow.com/questions/71742096/r-forcenetwork-how-do-i-keep-the-legend-in-the-top-left-corner-when-zooming-is
networkOnePlannedProcess <- htmlwidgets::onRender(
networkOnePlannedProcess,
jsCode = '
function (el, x) {
d3.select("svg").append("g").attr("id", "legend-layer");
var legend_layer = d3.select("#legend-layer");
d3.selectAll(".legend")
.each(function() { legend_layer.append(() => this); });
}
'
)
networkOnePlannedProcess
Figure 10. A network of entities (green), attributes (black), and values (grey) gives an overview of one planned process.
Together, R/SquareCube and sCubed enable scientists to explore, monitor, and share complex information about how investigations were conducted. When scientists can efficiently capture and fetch the provenances of physical materials, scientists can harness information to execute and share investigations, including
In turn, R/SquareCube and sCubed can help scientists minimize redundant experiments; reduce the time required to execute investigations; and increase use and reuse of analytical data, protocols, and study designs.
R/SquareCube and sCubed are currently being validated within the CIDC-UGA use-case and with consideration towards extending to other investigative subjects and experimental contexts. While our existing work concentrates on the provenance of phyisical material, our upcoming work aims to help scientists navigate across digital resources, e.g.
#### navigate across resources--BOGUS DATA
# src & target aren't accurate here. generated quickly to do basic problem solving
src <- c( "Set A", "Set 1", "Batch 1", "Batch 1",
"analytical sample 1", "analytical sample 2", "raw data 1",
"Set 1","raw data 2","Natural Strains (NS)","Natural Strains (NS)","Anchor manuscript",
"protocol 1","Anchor manuscript", "Anchor manuscript",
"analytical sample 1", "analytical sample 2","raw data 3","raw data 4",
"analytical sample 3", "analytical sample 4","analytical sample 3", "analytical sample 4",
"analytical sample 3", "analytical sample 4",
"raw data 6","raw data 5","raw data 7","raw data 8",
"analytical sample 3", "analytical sample 4","Set 2","Set 2",
"analytical sample 5", "analytical sample 6","raw data 9","raw data 10",
"analytical sample 5", "analytical sample 6","raw data 11", "raw data 12",
"analytical sample 5", "analytical sample 6","protocol 1","protocol 1",
"analytical sample 7","analytical sample 8","analytical sample 9",
"analytical sample 7","analytical sample 8","analytical sample 9",
"analytical sample 5", "analytical sample 6",
"analytical sample 7","analytical sample 8","analytical sample 9",
"raw data 13","raw data 14","raw data 15","Set 3","Set B","Set C")
target <- c( "Set 1", "Batch 1", "analytical sample 1", "analytical sample 2",
"raw data 1", "raw data 2", "Metabolomics Workbench, 1",
"Batch 2","Metabolomics Workbench, 2","analytical sample 1","analytical sample 2",
"Natural Strains (NS)","Natural Strains (NS)","UDP-glycosyltransferases (UGT) mutants",
"central metabolism (CM) mutants", "raw data 3","raw data 4",
"Metabolomics Workbench, 2", "Metabolomics Workbench, 1",
"UDP-glycosyltransferases (UGT) mutants","UDP-glycosyltransferases (UGT) mutants",
"raw data 5","raw data 6","raw data 7","raw data 8",
"Metabolomics Workbench, 2", "Metabolomics Workbench, 1",
"Metabolomics Workbench, 2", "Metabolomics Workbench, 1",
"Batch 3","Batch 3","Batch 3","Batch 3", "raw data 9","raw data 10",
"Metabolomics Workbench, 1","Metabolomics Workbench, 1",
"raw data 11", "raw data 12","Metabolomics Workbench, 2","Metabolomics Workbench, 2",
"central metabolism (CM) mutants", "central metabolism (CM) mutants",
"central metabolism (CM) mutants","UDP-glycosyltransferases (UGT) mutants",
"central metabolism (CM) mutants","UDP-glycosyltransferases (UGT) mutants",
"Natural Strains (NS)", "Batch 1", "Batch 3","Batch 6","Batch 6","Batch 6",
"raw data 13","raw data 14","raw data 15",
"Metabolomics Workbench, 0","Metabolomics Workbench, 0","Metabolomics Workbench, 0",
"Batch 6","Set 2","Set 3")
networkData <- data.frame(src, target, stringsAsFactors = FALSE, directed = TRUE)
nodes <- data.frame(name = unique(c(src, target)), stringsAsFactors = FALSE)
nodes$id <- 0:(nrow(nodes) - 1)
# create a data frame of the edges that uses id 0:9 instead of their names
edges <- networkData %>%
left_join(nodes, by = c("src" = "name")) %>%
select(-src) %>%
rename(source = id) %>%
left_join(nodes, by = c("target" = "name")) %>%
select(-target) %>%
rename(target = id)
edges$width <- 1
# make a grouping variable that will match to colours
#nodes$group <- ifelse(nodes$name %in% src, "lions", "tigers")
nodes$group <- nodes$name
nodes$size <- 5
ColourScale <- 'd3.scaleOrdinal()
.domain(["Set A", "First study","Batch 1","Set 1",
"analytical sample 1","analytical sample 2","raw data 1","Metabolomics Workbench, 1",
"raw data 2","Batch 2","Metabolomics Workbench, 2","Natural Strains (NS)",
"protocol 1","Anchor manuscript","UDP-glycosyltransferases (UGT) mutants",
"Anchor study","central metabolism (CM) mutants","raw data 3",
"raw data 4","raw data 5","raw data 6","raw data 7","raw data 8",
"analytical sample 3","analytical sample 4","Set 3","Batch 3",
"analytical sample 5","analytical sample 6","raw data 9","raw data 10",
"raw data 11","raw data 12","Metabolomics Workbench, 0","analytical sample 7",
"analytical sample 8","analytical sample 9","Batch 6",
"raw data 13","raw data 14","raw data 15","Set 3"])
.range(["orange", "orange","#05d0eb","#05d0eb",
"#05d0eb", "#05d0eb","black","purple","black","#05d0eb","purple","tan",
"deeppink","deeppink","tan","tan","tan","black","black","black","black","black","black",
"#05d0eb", "#05d0eb", "#05d0eb", "#05d0eb", "#05d0eb", "#05d0eb", "black",
"black", "black", "black","purple","#05d0eb", "#05d0eb", "#05d0eb","#05d0eb",
"black","black","black","#05d0eb"]);'
navigateNetwork <-
networkD3::forceNetwork(
Links = edges,
Nodes = nodes,
Source = "source",
Target = "target",
NodeID = "name",
Group = "group",
Value = "width",
opacity = 0.9,
zoom = TRUE,
colourScale = networkD3::JS(ColourScale),
opacityNoHover = FALSE,
Nodesize = "size",
fontSize = 14,
charge = -300,
height = 500,
width = 1300
)
customJS <-
'function(el) {
d3.select(el).select(".zoom-layer").attr("transform", "scale(0.5)")
}'
htmlwidgets::onRender(navigateNetwork, customJS)
Figure 13. A network illustrates relationships among both digital resources, study groups, and samples. Analytical data (black nodes), procotols, and mansuscripts (pink nodes) are available across different digital resources. Studies may be defined differently in repositories vs. within manuscripts, e.g. the CIDC-UGA established three gentic studies (tan nodes) and deposited data from each genetic study into three studies on Metabolomics Workbench (purple nodes). Scientists often further group samples into blocks or batches (blue nodes), e.g. ‘Batch 1’, ‘Set 1’. Further, scientists often alias samples or experimental groups (orange nodes). A network can help scientists understand how samples were grouped and aliased and enable scientitists to navigate across studies and digital resources with ease.