library(dplyr)
library(readr)
library(dotenv)
library(jsonlite)

# --- Load environment variables from a local .env file, if present ---
if (file.exists(".env")) {
  load_dot_env(file = ".env")
}

# --- Load dataset ID list ---
# RAISE_DATASET_ID_LIST is parsed as JSON below; its first element is used
# as the directory that contains datafile.csv.
dataset_env <- Sys.getenv("RAISE_DATASET_ID_LIST")
if (dataset_env == "") {
  stop("Environment variable RAISE_DATASET_ID_LIST is missing.")
}

# fromJSON() signals its own parse error on malformed input, which would
# abort the script before the validation below could report the problem.
# Map a parse failure to NULL so both "empty" and "invalid" inputs end up
# in the same, explicit error.
dataset_ids <- tryCatch(
  fromJSON(dataset_env),
  error = function(e) NULL
)
if (length(dataset_ids) == 0) {
  stop("RAISE_DATASET_ID_LIST is empty or invalid JSON.")
}

# Build the CSV path from the first dataset ID and fail early if it is absent.
csv_file_path <- file.path(dataset_ids[[1]], "datafile.csv")
if (!file.exists(csv_file_path)) {
  stop(paste("CSV file not found:", csv_file_path))
}

# Announce which file is about to be read.
cat("Loading CSV from:", csv_file_path, "\n")

# --- Read CSV ---
# show_col_types = FALSE silences readr's column-specification message.
iris_data <- read_csv(csv_file_path, show_col_types = FALSE)

# Preview the first rows.
print(head(iris_data))

# Capture the printed summary as a character vector (one element per line)
# and echo it.
summary_stats <- capture.output(summary(iris_data))
print(summary_stats)

# --- Per-species means (the CSV uses lowercase column names) ---
# Each measurement column is averaged within species, ignoring NA values;
# the output columns keep the names of the input columns.
species_means <- summarise(
  group_by(iris_data, species),
  across(
    c(sepal_length, sepal_width, petal_length, petal_width),
    function(values) mean(values, na.rm = TRUE)
  )
)

print(species_means)

# --- Correlation matrix ---
# Only numeric columns take part; rows containing any NA are dropped
# (use = "complete.obs").
numeric_cols <- select(iris_data, where(is.numeric))
cor_matrix <- cor(numeric_cols, use = "complete.obs")
print(cor_matrix)

# --- Make sure the output directory exists ---
if (!dir.exists("results")) {
  dir.create("results", recursive = TRUE)
}

# --- Write summary files ---
# The captured summary is written verbatim, one line per element.
writeLines(text = summary_stats, con = "results/summary_stats.txt")

# Per-species means: tab-separated, header row only, no quoting.
write.table(
  x = species_means,
  file = "results/species_means.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)

# Correlation matrix: row names are kept, and col.names = NA adds a blank
# header cell above them so the header lines up with the data columns.
write.table(
  x = cor_matrix,
  file = "results/correlation_matrix.txt",
  quote = FALSE,
  sep = "\t",
  col.names = NA
)

# --- Safe plotting ----
# Colour lookup keyed by species name; species values without an entry here
# map to NA and are therefore not drawn.
species_colors <- c("setosa" = "red", "versicolor" = "blue", "virginica" = "darkgreen")

# For extremely large datasets (14M rows), plot a random sample of at most
# 200000 rows instead of every point. No seed is set, so the sample differs
# between runs.
plot_sample <- iris_data %>%
  slice_sample(n = min(200000, nrow(iris_data)))

# Look colours up by name. as.character() guards against `species` being a
# factor, in which case plain indexing would use the integer codes instead.
sample_colors <- species_colors[as.character(plot_sample$species)]

png("results/sepal_vs_petal.png", width = 1000, height = 800)
# `finally` closes the PNG device even when plot() or legend() fails, so a
# failed run does not leave a graphics device open in the calling session.
invisible(tryCatch(
  {
    plot(
      plot_sample$sepal_length,
      plot_sample$petal_length,
      col = sample_colors,
      pch = 19,
      cex = 0.4,
      main = "Sepal Length vs Petal Length (sample of dataset)",
      xlab = "Sepal Length",
      ylab = "Petal Length"
    )
    legend(
      "topleft",
      legend = names(species_colors),
      col = species_colors,
      pch = 19,
      cex = 1.1
    )
  },
  finally = dev.off()
))

cat("Results written to results/ directory\n")
