Tidy Tuesday Exercise

Author

Elizabeth Hall

Exercise 13

Setup and Cleaning

First, the required libraries were loaded.

# Load required libraries
library(tidytuesdayR)
library(tidyverse)
library(jsonlite)
library(janitor)
library(here)
library(fs)

Then the data was loaded in and seperated.

# Load data (code from tinytuesday github)
tuesdata <- tidytuesdayR::tt_load('2024-04-09')


    Downloading file 1 of 4: `eclipse_annular_2023.csv`
    Downloading file 2 of 4: `eclipse_total_2024.csv`
    Downloading file 3 of 4: `eclipse_partial_2023.csv`
    Downloading file 4 of 4: `eclipse_partial_2024.csv`

# Seperate data (code from tinytuesday github)
eclipse_annular_2023 <- tuesdata$eclipse_annular_2023
eclipse_total_2024 <- tuesdata$eclipse_total_2024
eclipse_partial_2023 <- tuesdata$eclipse_partial_2023
eclipse_partial_2024 <- tuesdata$eclipse_partial_2024

The code was then cleaned using the cleaning script provided with the data.

# Initial cleaning script (code from tinytuesday github)
working_dir <- here::here("tidytuesday-exercise", "eclipse-data")

eclipse_cities_url_2024 <- "https://svs.gsfc.nasa.gov/vis/a000000/a005000/a005073/cities-eclipse-2024.json"
eclipse_cities_url_2023 <- "https://svs.gsfc.nasa.gov/vis/a000000/a005000/a005073/cities-eclipse-2023.json"

eclipse_cities_2024 <- jsonlite::fromJSON(eclipse_cities_url_2024) |> 
  tibble::as_tibble() |> 
  janitor::clean_names() |> 
  tidyr::unnest_wider(eclipse, names_sep = "_")

eclipse_total_2024 <- eclipse_cities_2024 |> 
  dplyr::filter(!is.na(eclipse_6))

eclipse_partial_2024 <- eclipse_cities_2024 |> 
  dplyr::filter(is.na(eclipse_6)) |> 
  dplyr::select(-eclipse_6)

eclipse_cities_2023 <- jsonlite::fromJSON(eclipse_cities_url_2023) |> 
  tibble::as_tibble() |> 
  janitor::clean_names() |> 
  tidyr::unnest_wider(eclipse, names_sep = "_")

eclipse_annular_2023 <- eclipse_cities_2023 |> 
  dplyr::filter(!is.na(eclipse_6))

eclipse_partial_2023 <- eclipse_cities_2023 |> 
  dplyr::filter(is.na(eclipse_6)) |> 
  dplyr::select(-eclipse_6)

readr::write_csv(
  eclipse_total_2024,
  fs::path(working_dir, "eclipse_total_2024.csv")
)
readr::write_csv(
  eclipse_partial_2024,
  fs::path(working_dir, "eclipse_partial_2024.csv")
)

readr::write_csv(
  eclipse_annular_2023,
  fs::path(working_dir, "eclipse_annular_2023.csv")
)
readr::write_csv(
  eclipse_partial_2023,
  fs::path(working_dir, "eclipse_partial_2023.csv")
)

Exploratory Data Analysis

First, the required libraries were loaded.

# Load required libraries
library(patchwork)
library(cowplot)
library(ggplot2)
library(maps)

Print summary statistics and counts of observation by state.

# Print summary statistics
summary(eclipse_total_2024)

    state               name                lat             lon         
 Length:3330        Length:3330        Min.   :28.45   Min.   :-101.16  
 Class :character   Class :character   1st Qu.:35.42   1st Qu.: -92.41  
 Mode  :character   Mode  :character   Median :39.24   Median : -86.56  
                                       Mean   :38.33   Mean   : -86.93  
                                       3rd Qu.:41.22   3rd Qu.: -82.31  
                                       Max.   :46.91   Max.   : -67.43  
  eclipse_1          eclipse_2          eclipse_3          eclipse_4        
 Length:3330        Length:3330        Length:3330        Length:3330       
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
  eclipse_5          eclipse_6        
 Length:3330        Length:3330       
 Class :character   Class :character  
 Mode  :character   Mode  :character

summary(eclipse_partial_2024)

    state               name                lat             lon         
 Length:28844       Length:28844       Min.   :17.96   Min.   :-176.60  
 Class :character   Class :character   1st Qu.:35.24   1st Qu.: -99.08  
 Mode  :character   Mode  :character   Median :39.52   Median : -90.30  
                                       Mean   :38.76   Mean   : -93.00  
                                       3rd Qu.:42.04   3rd Qu.: -81.16  
                                       Max.   :71.25   Max.   : 174.11  
  eclipse_1          eclipse_2          eclipse_3          eclipse_4        
 Length:28844       Length:28844       Length:28844       Length:28844      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
  eclipse_5        
 Length:28844      
 Class :character  
 Mode  :character

summary(eclipse_annular_2023)

    state               name                lat             lon         
 Length:811         Length:811         Min.   :27.22   Min.   :-124.45  
 Class :character   Class :character   1st Qu.:31.30   1st Qu.:-111.98  
 Mode  :character   Mode  :character   Median :35.42   Median :-106.70  
                                       Mean   :35.41   Mean   :-108.05  
                                       3rd Qu.:38.42   3rd Qu.:-101.36  
                                       Max.   :44.87   Max.   : -96.72  
  eclipse_1          eclipse_2          eclipse_3          eclipse_4        
 Length:811         Length:811         Length:811         Length:811        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
  eclipse_5          eclipse_6        
 Length:811         Length:811        
 Class :character   Class :character  
 Mode  :character   Mode  :character

summary(eclipse_partial_2023)

    state               name                lat             lon         
 Length:31363       Length:31363       Min.   :17.96   Min.   :-176.60  
 Class :character   Class :character   1st Qu.:35.36   1st Qu.: -97.50  
 Mode  :character   Mode  :character   Median :39.56   Median : -89.26  
                                       Mean   :38.80   Mean   : -91.97  
                                       3rd Qu.:41.93   3rd Qu.: -81.14  
                                       Max.   :71.25   Max.   : 174.11  
  eclipse_1          eclipse_2          eclipse_3          eclipse_4        
 Length:31363       Length:31363       Length:31363       Length:31363      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
  eclipse_5        
 Length:31363      
 Class :character  
 Mode  :character

# Count of observations by state
eclipse_total_2024 %>% count(state) %>% arrange(desc(n))

# A tibble: 14 × 2
   state     n
   <chr> <int>
 1 OH      693
 2 TX      591
 3 IN      586
 4 NY      403
 5 AR      365
 6 IL      252
 7 MO      168
 8 VT       87
 9 PA       71
10 OK       41
11 KY       38
12 ME       28
13 NH        4
14 MI        3

eclipse_partial_2024 %>% count(state) %>% arrange(desc(n))

# A tibble: 52 × 2
   state     n
   <chr> <int>
 1 PA     1817
 2 CA     1611
 3 TX     1268
 4 IL     1210
 5 IA     1025
 6 FL      955
 7 MN      914
 8 MO      914
 9 NY      889
10 WI      807
# ℹ 42 more rows

Generate map visualizations.

# Load required data
world_map <- map_data("world")

# Map visualization for annular eclipse 2023
plot_annular_2023 <- ggplot() +
  geom_polygon(data = world_map, aes(x = long, y = lat, group = group), fill = "gray80", color = "gray60") +
  geom_point(data = eclipse_annular_2023, aes(x = lon, y = lat), color = "red") +
  labs(title = "Annular Eclipse 2023", size=12) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Map visualization for total eclipse 2024
plot_total_2024 <- ggplot() +
  geom_polygon(data = world_map, aes(x = long, y = lat, group = group), fill = "gray80", color = "gray60") +
  geom_point(data = eclipse_total_2024, aes(x = lon, y = lat), color = "blue") +
  labs(title = "Total Eclipse 2024", size=12) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Map visualization partial eclipse 2023
plot_partial_2023 <- ggplot() +
  geom_polygon(data = world_map, aes(x = long, y = lat, group = group), fill = "gray80", color = "gray60") +
  geom_point(data = eclipse_partial_2023, aes(x = lon, y = lat), color = "red") +
  labs(title = "Partial Eclipse 2023", size=12) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Map visualization for partial eclipse 2024
plot_partial_2024 <- ggplot() +
  geom_polygon(data = world_map, aes(x = long, y = lat, group = group), fill = "gray80", color = "gray60") +
  geom_point(data = eclipse_partial_2024, aes(x = lon, y = lat), color = "blue") +
  labs(title = "Partial Eclipse 2024", size=12) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Combine plots using patchwork
combined_plot <- (plot_annular_2023 +
  plot_total_2024 +
  plot_partial_2023 +
  plot_partial_2024 +
  plot_layout(nrow = 2, byrow = TRUE)) +
  plot_annotation((title="Eclipse Viewing Locations"),
  theme = theme(plot.title = element_text(hjust = 0.5, size = 16)))

# Display the combined plot
combined_plot

Question

Is it possible to predict the duration of an eclipse at any location based on its geographical position and the eclipse type?

Pre-Processing

Loading required libraries.

# Load required libraries
library(tidymodels)
library(yardstick)
library(xgboost)
library(hardhat)
library(glmnet)
library(ranger)
library(dplyr)
library(dials)

Processing data for modeling and calculating total duration.

# Prepare the data for modeling
eclipse_data <- bind_rows(
  mutate(eclipse_annular_2023, eclipse_type = "annular"),
  mutate(eclipse_total_2024, eclipse_type = "total")
)

# Function to convert time column to numeric format
convert_to_numeric <- function(x) {
  as.numeric(as.POSIXlt(x, format = "%H:%M:%S")$hour * 60 + as.POSIXlt(x, format = "%H:%M:%S")$min)
}

# Loop through eclipse columns to convert to numeric format
for (i in 1:6) { 
  eclipse_col <- paste0("eclipse_", i)
  eclipse_data[[eclipse_col]] <- convert_to_numeric(eclipse_data[[eclipse_col]])
}

# Calculate total duration
eclipse_data$total_duration <- eclipse_data$eclipse_6 - eclipse_data$eclipse_1

Creating a data split and recipie for model fitting.

# Separate the data into features and target variable
features <- eclipse_data %>%
  select(lat, lon, eclipse_type)
target <- eclipse_data$total_duration

# Create a data split
set.seed(042) # for reproducibility
data_split <- initial_split(eclipse_data, prop = 0.8)
train_data <- training(data_split)
test_data <- testing(data_split)

# Create a recipe for preprocessing the data
eclipse_recipe <- recipe(total_duration ~ lat + lon + eclipse_type, data = train_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_predictors(), -all_outcomes()) %>%
  step_scale(all_predictors(), -all_outcomes())

Model Fitting

Defining model specifications.

# Define model specifications...

# LASSO model
lasso_spec <- linear_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet") %>%
  set_mode("regression")

# Random forest model
rf_spec <- rand_forest(trees = 300, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", seed = 042) %>%
  set_mode("regression")

# Boosted trees model 
boost_spec <- boost_tree(trees = tune(), tree_depth = tune(), min_n = tune(), learn_rate = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

Creating workflows.

# Bundle the recipe and model specs into workflows...

# LASSO model
lasso_workflow <- workflow() %>%
  add_recipe(eclipse_recipe) %>%
  add_model(lasso_spec)

# Random forest model
rf_workflow <- workflow() %>%
  add_recipe(eclipse_recipe) %>%
  add_model(rf_spec)

# Boosted trees model
boost_workflow <- workflow() %>%
  add_recipe(eclipse_recipe) %>%
  add_model(boost_spec)

Setting up cross-validation.

# Setup for cross-validation
set.seed(042) # for reproducibility
cv_folds <- vfold_cv(train_data, v = 5, repeats = 3)

Fitting the LASSO model.

# -- [4a] LASSO Model ----------

# Set up a grid for the LASSO model
lasso_grid <- tibble(
  penalty = c(0.001, 0.01, 0.1, 0.5, 1)  
)

# Tune the LASSO model
lasso_tune_results <- tune_grid(
  lasso_workflow,
  resamples = cv_folds,
  grid = lasso_grid
)

# Select the best hyperparameters
best_lasso_params <- select_best(lasso_tune_results, "rmse")

# Update the LASSO specification with the best hyperparameters
lasso_spec_final <- lasso_spec %>%
  set_engine("glmnet") %>%
  set_mode("regression") %>%
  finalize_model(best_lasso_params)

# Fit the final LASSO model
lasso_fit_final <- workflow() %>%
  add_recipe(eclipse_recipe) %>%
  add_model(lasso_spec_final) %>%
  fit(data = train_data)

Fitting the Random Forest model.

# -- [4b] Random Forest Model ----------

# Create a tuning grid
rf_grid <- grid_regular(
  mtry(range = c(1, 3)),  
  min_n(range = c(1, 21)),
  levels = 7
)

# Tune the random forest model
rf_tune_results <- tune_grid(
  rf_workflow,
  resamples = cv_folds,
  grid = rf_grid
)

# Extract the best hyperparameters
best_rf_params <- select_best(rf_tune_results, "rmse")

# Update the Random Forest specification with the best hyperparameters
rf_spec_final <- rf_spec %>%
  finalize_model(best_rf_params) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Fit the final Random Forest model
rf_fit_final <- workflow() %>%
  add_recipe(eclipse_recipe) %>%
  add_model(rf_spec_final) %>%
  fit(data = train_data)

Fitting the Boosted Tree model.

# -- [4c] Boosted Trees Model ----------

# Define the parameters
boost_params <- parameters(
  tree_depth(range = c(1, 10)),
  learn_rate(range = c(0.01, 0.3)),
  trees(c(100, 500)),
  min_n(c(5, 20))  
)

# Create a Latin Hypercube Sampling grid
boost_grid <- grid_latin_hypercube(
  boost_params,
  size = 20
)

# Ensure 'cv' is correctly defined; assuming it should be 'cv_folds' used earlier
boost_tune_results <- tune_grid(
  boost_workflow,
  resamples = cv_folds, 
  grid = boost_grid
)

# Select the best hyperparameters
best_boost_params <- select_best(boost_tune_results, "rmse")

# Update the Boosted Trees specification with the best hyperparameters
boost_spec_final <- boost_spec %>%
  set_engine("xgboost") %>%
  set_mode("regression") %>%
  finalize_model(best_boost_params)

# Fit the final Boosted Trees model
boost_fit_final <- workflow() %>%
  add_recipe(eclipse_recipe) %>%
  add_model(boost_spec_final) %>%
  fit(data = train_data)

Model Evaluation

Loading required libraries.

# Load required libraries
library(knitr)

Creating model evaluation functions.

# Function to generate predictions ensuring consistent column naming
make_predictions <- function(model_fit, new_data) {
  predictions <- predict(model_fit, new_data)
  if (!"`.pred`" %in% names(predictions)) {
    predictions <- predictions %>%
      rename(.pred = .pred)  # Ensure this matches the output column name from `predict()`
  }
  predictions
}

# Function to evaluate model performance
evaluate_model <- function(predictions, actual_data, outcome_var) {
  results <- actual_data %>%
    select({{outcome_var}}) %>%
    bind_cols(predictions) %>%
    mutate(.resid = {{outcome_var}} - .pred,
           Resid_Sign = ifelse(.resid > 0, "Positive", "Negative"))
  
  metrics_results <- metrics(results, truth = {{outcome_var}}, estimate = .pred)
  rmse_results <- rmse(results, truth = {{outcome_var}}, estimate = .pred)
  rsq_results <- rsq(results, truth = {{outcome_var}}, estimate = .pred)
  
  # Enhanced plot with colors for positive and negative residuals
  residuals_plot <- ggplot(results, aes(x = .pred, y = .resid, color = Resid_Sign)) +
    geom_point() +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    scale_color_manual(values = c("Positive" = "blue", "Negative" = "red")) +
    labs(title = "Residuals vs. Predicted Values", x = "Predicted Values", y = "Residuals",
         color = "Residual Sign") +
    theme_minimal()
  
  list(
    Metrics = metrics_results,
    RMSE = rmse_results,
    R_Squared = rsq_results,
    Residuals_Plot = residuals_plot
  )
}

Evaluating models.

# Evaluate models
predictions_lasso <- make_predictions(lasso_fit_final, test_data)
eval_lasso <- evaluate_model(predictions_lasso, test_data, total_duration)

predictions_rf <- make_predictions(rf_fit_final, test_data)
eval_rf <- evaluate_model(predictions_rf, test_data, total_duration)

predictions_boost <- make_predictions(boost_fit_final, test_data)
eval_boost <- evaluate_model(predictions_boost, test_data, total_duration)

Print LASSO model evaluation.

# Print evaluation results for each model
cat("LASSO Model Evaluation:\n")

LASSO Model Evaluation:

print(eval_lasso$Metrics)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       1.72 
2 rsq     standard       0.974
3 mae     standard       1.35

print(eval_lasso$RMSE)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.72

print(eval_lasso$R_Squared)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.974

eval_lasso$Residuals_Plot

Print Random Forest model evaluation.

cat("\nRandom Forest Model Evaluation:\n")


Random Forest Model Evaluation:

print(eval_rf$Metrics)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.367
2 rsq     standard       0.999
3 mae     standard       0.237

print(eval_rf$RMSE)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.367

print(eval_rf$R_Squared)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.999

eval_rf$Residuals_Plot

Print Boosted Tree model evaluation.

cat("\nBoosted Trees Model Evaluation:\n")


Boosted Trees Model Evaluation:

print(eval_boost$Metrics)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.491
2 rsq     standard       0.998
3 mae     standard       0.370

print(eval_boost$RMSE)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.491

print(eval_boost$R_Squared)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard       0.998

eval_boost$Residuals_Plot

Extract RMSE and R-Squared values.

# Extract RMSE and R_Squared from the evaluation lists
rmse_lasso <- eval_lasso$RMSE$.estimate
rsq_lasso <- eval_lasso$R_Squared$.estimate

rmse_rf <- eval_rf$RMSE$.estimate
rsq_rf <- eval_rf$R_Squared$.estimate

rmse_boost <- eval_boost$RMSE$.estimate
rsq_boost <- eval_boost$R_Squared$.estimate

Create summary table.

# Create the summary table
metrics_summary <- tibble(
  Model = c("LASSO", "Random Forest", "Boosted Trees"),
  RMSE = c(rmse_lasso, rmse_rf, rmse_boost),
  R_Squared = c(rsq_lasso, rsq_rf, rsq_boost)
)

# Print a nicely formatted table for Markdown
kable(metrics_summary, caption = "Comparison of Model Performance Metrics")

Comparison of Model Performance Metrics
Model	RMSE	R_Squared
LASSO	1.7227196	0.9739039
Random Forest	0.3674444	0.9988143
Boosted Trees	0.4909033	0.9978832

#### Thoughts… The random forest model is the best choice out of the three. The RF model has the lowest RMSE score indicating a lower error rate, and a high R-squared value indicating that it explains the variation within the data. While it’s residuals graph isn’t perfect and still has some pattern to it, it is the most random of the three further indicating that it is the best choice.

Findings

Printing summary table.

# Print a nicely formatted table of model performance metrics
cat("Model Performance Summary:\n")

Model Performance Summary:

print(kable(metrics_summary, caption = "Comparison of Model Performance Metrics"))



Table: Comparison of Model Performance Metrics

|Model         |      RMSE| R_Squared|
|:-------------|---------:|---------:|
|LASSO         | 1.7227196| 0.9739039|
|Random Forest | 0.3674444| 0.9988143|
|Boosted Trees | 0.4909033| 0.9978832|

Generating map of predicted eclipse durations (RF model) vs actual eclipse durations.

# Generate predictions using the Random Forest model
predictions <- predict(rf_fit_final, test_data)
test_data$Predicted_Duration = predictions$.pred

# Prepare the data frame for mapping 
map_data <- test_data %>%
  select(lat, lon, total_duration, Predicted_Duration)

# Basic US map
us_map <- map_data("usa")

# Create the plot with aspect ratio fixed
ggplot() +
  geom_polygon(data = us_map, aes(x = long, y = lat, group = group), fill = "gray80", colour = "gray60") +
  geom_point(data = map_data, aes(x = lon, y = lat, color = "Actual Duration"), size = 2, alpha = 0.6) +
  geom_point(data = map_data, aes(x = lon, y = lat, color = "Predicted Duration"), size = 1, alpha = 0.6) +
  scale_color_manual(values = c("Actual Duration" = "blue", "Predicted Duration" = "red")) +
  labs(title = "Comparison of Actual and Predicted Eclipse Durations (US Region)",
       subtitle = "Blue = Actual Duration, Red = Predicted Duration",
       x = "Longitude", y = "Latitude") +
  theme_minimal() +
  theme(legend.position = "right")

Conclusion

The Random Forest model demonstrated the lowest RMSE and the highest R-squared among the tested models, indicating its strong predictive power. Based on the map of actual vs predicted eclipse durations, the RF model seems to be able to predict eclipse duration based on geographic location. This indicates that the answer to the initial question “Is it possible to predict the duration of an eclipse at any location based on its geographical position and the eclipse type?” is yes.