Skip to content

R script

How to create an R script for multiple-dataset execution

This code is an implementation of an R main script. It demonstrates how to read data from files, process it, and generate results in different formats.

In this example, we will have this directory structure:

|-- main.R
|-- renv.lock
  • The main.R and renv.lock files are mandatory and must have these specific filenames.
  • Other files can be added, for example additional R scripts used by main.R.

Import Required Libraries

There are no specific required libraries; you can import any library you need, as long as it is declared in the renv.lock file.

Load Datasets Dynamically

Only when working locally, you’ll have to download and organize your data files according to the required folder structure. (See also Dynamic Dataset(s) Accessing.) Each data file should be stored in a folder named after its ID (the data file’s UUID can be found in its metadata), in the same directory as the main.R script. Then, create a .env file that defines the variable RAISE_DATASET_ID_LIST. This variable should contain an ordered list of the dataset UUIDs in the following format:

RAISE_DATASET_ID_LIST=["00000000-0000-0000-0000-000000000001","00000000-0000-0000-0000-000000000002"]

Read Data Files

Once your experimental environment is ready, you can read the files:

# RAISE_DATASET_ID_LIST is expected to hold a JSON array of dataset UUIDs.
# Sys.getenv() returns "" when the variable is not set.
dataset_env <- Sys.getenv("RAISE_DATASET_ID_LIST")
if (!nzchar(dataset_env)) {
  stop("Environment variable RAISE_DATASET_ID_LIST is missing.")
}
# fromJSON() raises its own parser error on malformed input, so trap it and
# report the failure in terms of the variable that has to be fixed.
# The function is namespace-qualified so the snippet runs without library().
dataset_ids <- tryCatch(
  jsonlite::fromJSON(dataset_env),
  error = function(e) {
    stop(
      "RAISE_DATASET_ID_LIST is empty or invalid JSON: ", conditionMessage(e),
      call. = FALSE
    )
  }
)
# Valid JSON can still be an empty array ("[]").
if (length(dataset_ids) == 0) {
  stop("RAISE_DATASET_ID_LIST is empty or invalid JSON.")
}
# Build the CSV path: <first dataset UUID>/datafile.csv
csv_file_path <- file.path(dataset_ids[[1]], "datafile.csv")

The data file must always be named “datafile” with the appropriate extension: e.g., .csv, .txt, .edf, .json… There is no limit to the number of datasets that can be loaded.

Gather Results

Any type of file can be produced as a result: images, CSV, text… Moreover, the number of results is not limited. Note that the results must be stored under the “results” directory, which is located at the same level as the main script.

# --- Create results directory ---
# Only create the folder when it is not already there.
has_results_dir <- dir.exists("results")
if (!has_results_dir) {
  dir.create("results", recursive = TRUE)
}
# --- Write summary files ---
# summary_stats is a character vector; each element becomes one line.
writeLines(text = summary_stats, con = "results/summary_stats.txt")

Complete main.R script

library(dplyr)
library(readr)
library(jsonlite)
# --- Load dataset ID list ---
# RAISE_DATASET_ID_LIST is expected to hold a JSON array of dataset UUIDs.
# Sys.getenv() returns "" when the variable is not set.
dataset_env <- Sys.getenv("RAISE_DATASET_ID_LIST")
if (!nzchar(dataset_env)) {
  stop("Environment variable RAISE_DATASET_ID_LIST is missing.")
}
# fromJSON() raises its own parser error on malformed input, so trap it and
# report the failure in terms of the variable that has to be fixed.
dataset_ids <- tryCatch(
  fromJSON(dataset_env),
  error = function(e) {
    stop(
      "RAISE_DATASET_ID_LIST is empty or invalid JSON: ", conditionMessage(e),
      call. = FALSE
    )
  }
)
# Valid JSON can still be an empty array ("[]").
if (length(dataset_ids) == 0) {
  stop("RAISE_DATASET_ID_LIST is empty or invalid JSON.")
}
# Build the CSV path: <first dataset UUID>/datafile.csv
csv_file_path <- file.path(dataset_ids[[1]], "datafile.csv")
# Fail early with the offending path instead of a generic reader error.
if (!file.exists(csv_file_path)) {
  stop(paste("CSV file not found:", csv_file_path))
}
cat("Loading CSV from:", csv_file_path, "\n")
# --- Read CSV ---
# show_col_types = FALSE silences readr's column specification message.
iris_data <- read_csv(file = csv_file_path, show_col_types = FALSE)
# Preview of the first rows
print(head(iris_data))
# Capture the printed summary as text so it can be written to a file later
summary_stats <- capture.output(summary(iris_data))
print(summary_stats)
# --- Species means (lowercase columns!) ---
# One row per species, one mean per measurement; NA values are ignored.
by_species <- group_by(iris_data, species)
species_means <- summarise(
  by_species,
  sepal_length = mean(sepal_length, na.rm = TRUE),
  sepal_width = mean(sepal_width, na.rm = TRUE),
  petal_length = mean(petal_length, na.rm = TRUE),
  petal_width = mean(petal_width, na.rm = TRUE)
)
print(species_means)
# --- Correlation matrix ---
# Restrict to numeric columns; rows containing any NA are dropped by cor().
numeric_cols <- select(iris_data, where(is.numeric))
cor_matrix <- cor(x = numeric_cols, use = "complete.obs")
print(cor_matrix)
# --- Create results directory ---
# Only create the folder when it is not already there.
has_results_dir <- dir.exists("results")
if (!has_results_dir) {
  dir.create("results", recursive = TRUE)
}
# --- Write summary files ---
# Plain-text summary, one captured output line per file line.
writeLines(text = summary_stats, con = "results/summary_stats.txt")
# Tab-separated per-species means, without row names or quoting.
write.table(
  species_means,
  "results/species_means.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)
# Tab-separated correlation matrix; col.names = NA writes an empty header
# cell above the row names so the columns stay aligned.
write.table(
  cor_matrix,
  "results/correlation_matrix.txt",
  quote = FALSE,
  sep = "\t",
  col.names = NA
)
# --- Safe plotting ----
# Fixed colour per species; the names must match the values found in the
# species column, otherwise the lookup below yields NA for those points.
species_colors <- c(
  "setosa" = "red",
  "versicolor" = "blue",
  "virginica" = "darkgreen"
)
# For extremely large datasets (14M rows)
# Optionally sample for plotting:
plot_sample <- iris_data %>%
  slice_sample(n = min(200000, nrow(iris_data)))
# Colours are derived from the sampled rows only, so nothing is computed over
# the full dataset. as.character() forces a lookup by name even if the
# species column is a factor (which would otherwise index by integer code).
point_colors <- species_colors[as.character(plot_sample$species)]
png("results/sepal_vs_petal.png", width = 1000, height = 800)
plot(
  plot_sample$sepal_length,
  plot_sample$petal_length,
  col = point_colors,
  pch = 19,
  cex = 0.4,
  main = "Sepal Length vs Petal Length (sample of dataset)",
  xlab = "Sepal Length",
  ylab = "Petal Length"
)
legend(
  "topleft",
  legend = names(species_colors),
  col = species_colors,
  pch = 19,
  cex = 1.1
)
# Close the PNG device so the image file is flushed to disk.
dev.off()
cat("Results written to results/ directory\n")

renv.lock file

{
"R": {
"Version": "4.3.1",
"Repositories": [
{
"Name": "CRAN",
"URL": "https://cloud.r-project.org"
}
]
},
"Packages": {
"renv": {
"Package": "renv",
"Version": "1.0.7",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "1bbf3e5c0cd0f01f7af5120a0610b4f8"
},
"dplyr": {
"Package": "dplyr",
"Version": "1.1.4",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "2f99532956543d4d6acdfb36f996bbdd",
"Requirements": [
"R",
"generics",
"glue",
"lifecycle",
"magrittr",
"pillar",
"pkgconfig",
"rlang",
"tibble",
"tidyselect",
"vctrs"
]
},
"readr": {
"Package": "readr",
"Version": "2.1.4",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "b9fb44bc4af0a2c595d4a1a278e6e0d4",
"Requirements": [
"R",
"cli",
"cpp11",
"hms",
"lifecycle",
"methods",
"R6",
"rlang",
"tibble",
"tzdb",
"vroom"
]
},
"jsonlite": {
"Package": "jsonlite",
"Version": "1.8.8",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "408dc3a39de6a5f1a78bbf1bbffcfbbd",
"Requirements": [ "R", "methods" ]
},
"rlang": {
"Package": "rlang",
"Version": "1.1.3",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "1980e0b82466e0e5ebc41d434f630e35"
},
"tibble": {
"Package": "tibble",
"Version": "3.2.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "a37e24f3fa1be13c2447b5f0de73c8dc"
},
"magrittr": {
"Package": "magrittr",
"Version": "2.0.3",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "7f53e31f629c00e572d2c6b0b2d93e6c"
},
"pillar": {
"Package": "pillar",
"Version": "1.9.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "bc09583a8e33a4105d085d2a8b6548bd"
},
"vctrs": {
"Package": "vctrs",
"Version": "0.6.5",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "71777b480bd4f5239fef37056ea3fa5f"
},
"tidyselect": {
"Package": "tidyselect",
"Version": "1.2.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "19f8edb882c88f08f0b4ae126a7c5d81"
},
"cli": {
"Package": "cli",
"Version": "3.6.2",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "d0f50c1df2ad1fb36793fadc7d135d63"
},
"cpp11": {
"Package": "cpp11",
"Version": "0.4.7",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "9cefa29fb7f8e3dd4979214f99cdaf3e"
},
"hms": {
"Package": "hms",
"Version": "1.1.3",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "6cb4b8b85bc0ff97b7d999bbb1b7c327"
},
"tzdb": {
"Package": "tzdb",
"Version": "0.4.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "25b4e3c39a0e833f612e6dee9d1b8c49"
},
"vroom": {
"Package": "vroom",
"Version": "1.6.5",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "7426b2e050fb75ebb0cbd6ef8d7311ca"
},
"glue": {
"Package": "glue",
"Version": "1.7.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "df2d6244ae87f1c5c61dad994c3335e1"
},
"lifecycle": {
"Package": "lifecycle",
"Version": "1.0.4",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "ccc7b5d98e3e0c7bd70cbd07e66dbe69"
},
"pkgconfig": {
"Package": "pkgconfig",
"Version": "2.0.3",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "59bc55f3a64f490550cc7295abd5fb5f"
},
"R6": {
"Package": "R6",
"Version": "2.5.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "470851b552721c432788bd0a73b950c6"
}
}
}

Logs

Finally, the log system has been improved. If the experiment does not fail, the output of the user’s print() calls in the script will be logged. In the case of an error, the logs save the exact error that caused the execution to fail.
If the main.R execution fails, you will be able to see the exact reason for the failure (wrongly defined variables, syntax errors…).
If the creation of the child container is not successful, the logs will contain the reason for the failure (incompatible versions in the renv.lock file, non-existing package versions…).

You can find some examples at the templates section.