Introduction

The tidymodels framework is a collection of packages for modeling and machine learning using tidyverse principles. The tidyverse principles are a group of dogmatic but consistent practices for data organization and analysis. They are intended to minimize the time spent adjusting the experimental data for analysis, so called “data-wrangling” or “data-munging”. This standard also provides consistency in data analysis packages, as all tidy packages are designed expecting a specifically ordered “tidy” dataframe. The core of these packages is the “tidyverse” written by Hadley Wickham. There is an extended family of tidy packages written by third party developers following the tidy framework. While the core tidyverse packages focused on data manipulation, the “tidymodels” packages are designed to extended these principles into the modeling and analysis pipeline.

In the context of high throughput phenotyping, the tidymodels package is very useful for analyzing data collecting from image analysis programs. The framework also provides a useful way to learn machine learning and modeling workflows, which are used to train image analysis algorithms. We will demonstate how to use the tidymodels packages for plant phenotyping analysis using output datasets from other plant phenotyping workbooks.

There is a an accompanying powerpoint to this R Markdown document which explains the steps of this workflow in more detail.

Load Necessary Packages

#Load necessary packages
list.of.packages <- c("tidyverse", "tidymodels", "ranger", "kknn")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos = "https://cloud.r-project.org")
invisible(capture.output(lapply(list.of.packages, library, character.only = TRUE, warn.conflicts = FALSE, quietly = T)))
## -- Attaching packages --------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.1
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## -- Attaching packages -------------------------------------------------- tidymodels 0.1.1 --
## v broom     0.7.0      v recipes   0.1.13
## v dials     0.0.8      v rsample   0.0.7 
## v infer     0.5.3      v tune      0.1.1 
## v modeldata 0.0.2      v workflows 0.1.2 
## v parsnip   0.1.3      v yardstick 0.0.7
## -- Conflicts ----------------------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
rm(list = ls()) 

Pre-Processing

#set random seed for reproducability
set.seed(123)
#Read in data
plant_data <- readr::read_csv("pdata.csv") %>% #read in CSV file
  dplyr::select(-c("Trait", "myIndex")) %>% #select traits of interest
  dplyr::mutate(fertilize = ifelse(Plot%%2 ==0, TRUE, FALSE)) %>% #add boolean fertilizer variable
  dplyr::mutate(crop = factor(sample(c("maize", "soybean", "alfalfa"), size = nrow(.), replace = T))) %>% #assign plots different crops
  janitor::clean_names() #reduce column names to lowercase, remove punctuation
## Parsed with column specification:
## cols(
##   DAP = col_double(),
##   Name = col_character(),
##   Column = col_double(),
##   Row = col_double(),
##   Plot = col_double(),
##   Trait = col_double(),
##   NGRDI = col_double(),
##   BGI = col_double(),
##   GLI = col_double(),
##   VARI = col_double(),
##   myIndex = col_double(),
##   EPH = col_double()
## )
colnames(plant_data) #look at column names
##  [1] "dap"       "name"      "column"    "row"       "plot"      "ngrdi"    
##  [7] "bgi"       "gli"       "vari"      "eph"       "fertilize" "crop"
head(plant_data) #peek at data

Data Mapping

  • DAP - (Integer) Days After Planting
  • Column/Row - (Integer) Coordinate position of plant in field plot
  • Plot - (Integer) Plot number in which plant was grown
  • NGRDI - (Double) Normalized Difference Green/Red Index
  • BGI - (Double) Blue green Pigment Index
  • GLI - (Double) Green Leaf Index
  • VARI - (Double) Visible Atmospherically Resistant Index Green
  • EPH - (Double) Estimate Plant Height in meters
  • Fertilize - (Boolean) Fertilizer treatment applied
  • Crop - (Factor) crop type

Sampling

#split the dataset into a training and testing set
plant_split <- rsample::initial_split(plant_data, prop = 0.9) #create a split object where 9/10 proportion of data is used for training
plant_train <- rsample::testing(plant_split) #isolate training set into one object
plant_test <- rsample::training(plant_split) #isolate testing set into one object
tibble::glimpse(plant_test) #peek at plant testing set
## Rows: 631
## Columns: 12
## $ dap       <dbl> 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 4...
## $ name      <chr> "G51", "G92", "G37", "G96", "G35", "G13", "G3", "G58", "G...
## $ column    <dbl> 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5...
## $ row       <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9,...
## $ plot      <dbl> 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 13...
## $ ngrdi     <dbl> 0.08519463, 0.11104453, 0.11680974, 0.10792150, 0.1055199...
## $ bgi       <dbl> 0.6222127, 0.6161263, 0.6080161, 0.6128025, 0.6307734, 0....
## $ gli       <dbl> 0.1544255, 0.1707747, 0.1767045, 0.1701369, 0.1627840, 0....
## $ vari      <dbl> 0.1282276, 0.1688281, 0.1770045, 0.1633977, 0.1617865, 0....
## $ eph       <dbl> 0.04168053, 0.06098519, 0.06159830, 0.06614864, 0.0546606...
## $ fertilize <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE...
## $ crop      <fct> alfalfa, alfalfa, alfalfa, soybean, alfalfa, soybean, soy...

Pre-processing with Recipes

plant_recipe <- head(plant_train) %>%
  recipes::recipe(eph ~ ngrdi) 

#if the model formula is extremely long, it may be necessary to define the model formula using update_role functions
plant_recipe2 <- head(plant_train) %>%
  recipes::recipe() %>%
  recipes::update_role(eph, new_role = "outcome") %>% #
  recipes::update_role(ngrdi, new_role = "predictor") 

#"selector functions can be used to select a group of variables at one time
plant_recipe3 <- head(plant_train) %>%
  recipes::recipe() %>%
  recipes::update_role(matches("ep."), new_role = "outcome") %>%
  recipes::update_role(contains("N"), new_role = "predictor")

#you can also yse steps to transform variables with step functions (https://recipes.tidymodels.org/reference/index.html#section-step-functions-individual-transformations)
plant_recipe4 <- head(plant_train) %>%
  recipes::recipe(eph ~ ngrdi) %>%
  recipes::step_center(all_predictors(), -all_outcomes()) %>%
  recipes::step_scale(all_predictors(), -all_outcomes()) 


#execute the recipe on test data to validate
plant_recipe <- recipes::prep(plant_recipe)
#extract data where the recipe was applied 
plant_head <- recipes::juice(plant_recipe)
#apply recipe to data of interest
plant_train <- recipes::bake(plant_recipe, plant_train)
plant_test <- recipes::bake(plant_recipe, plant_test)
tibble::glimpse(plant_test)
## Rows: 631
## Columns: 2
## $ ngrdi <dbl> 0.08519463, 0.11104453, 0.11680974, 0.10792150, 0.10551998, 0...
## $ eph   <dbl> 0.04168053, 0.06098519, 0.06159830, 0.06614864, 0.05466063, 0...

Model Interface with Parsnip

#Define model
linear_lm <- linear_reg() %>% #define a linear regression model
  set_engine("lm") %>% #decide what package you want to use for the linear regression
  set_mode("regression")  #set regression or classification depending on model specifications

#alternative model arrangement for comparison
linear_glmnet <- linear_reg() %>%
  set_engine("stan") %>%
  set_mode("regression")

#note engines do not have to be R packages, tidymodels supports interfacing with outside programs such as keras or tensorflow. One of the benefits to using R.

Combing Recipes and Models with Workflows

#create a workflow
plant_workflow <- workflows::workflow() %>% #define workflow
  workflows::add_recipe(plant_recipe) %>% #add recipe specification
  workflows::add_model(linear_lm) #add model specification

#fit workflow to data
plant_fit <- parsnip::fit(plant_workflow, plant_train)

Prediction

#predict test data from fitting model
plant_pred <- predict(plant_fit, plant_test)
#bind prediction results to test data for easy comparison
plant_test <- dplyr::bind_cols(plant_test, plant_pred)
tibble::glimpse(plant_test)
## Rows: 631
## Columns: 3
## $ ngrdi <dbl> 0.08519463, 0.11104453, 0.11680974, 0.10792150, 0.10551998, 0...
## $ eph   <dbl> 0.04168053, 0.06098519, 0.06159830, 0.06614864, 0.05466063, 0...
## $ .pred <dbl> 0.10443734, 0.15757514, 0.16942625, 0.15115535, 0.14621872, 0...

Metrics with yardstick

#calculate model performance metrics (default)
plant_metrics <- yardstick::metrics(plant_test, truth = eph, estimate = .pred)
plant_metrics
#calculate a single model performance metric
yardstick::rmse(data = plant_test, truth = eph, estimate = .pred)
#calculate a custom set of model performance metrics
plant_multi <- yardstick::metric_set(rmse, rsq)
plant_multi(data = plant_test, truth = eph, estimate = .pred)

k-Fold Cross Validation with rsample and tune

#Split training data for cross validation scheme into 5 folds with 5 repeats
plant_folds <- rsample::vfold_cv(training(plant_split), v = 5, repeats = 5)
#define a control object to change settings in the cross validation process
plant_control <- tune::control_grid(
  verbose = FALSE,  #print out progress while fitting
  allow_par = TRUE, #allow parallel processing
  extract = function(x){extract_model(x)},  #extract the individual fitting model object for each split
  save_pred = TRUE #save model predictions
  )

#fit the workflow to the folds object, with control settings
plant_cv <- tune::fit_resamples(plant_workflow, plant_folds, control = plant_control)
#collect performance metrics from the folds
plant_metrics <- tune::collect_metrics(plant_cv)
#collect predictions from the folds
plant_predictions <- tune::collect_predictions(plant_cv)

Memory efficiency of rset objects

#set seed to random integer
set.seed(Sys.time())
#create list to store folds
custom_folds <- list()
#create 25 splits of the training data 
for(i in 1:25){
  custom_folds[[i]] <- initial_split(training(plant_split), prop = 0.8)
}
#convert list folds to tibble
custom_folds <- tibble(custom_folds)

#compare object size
pryr::object_size(plant_folds)
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
## 137 kB
pryr::object_size(custom_folds)
## 1.5 MB

Model Tuning with Random Forest Model

#Definee a new recipe for random forest model 
rf_recipe <- head(training(plant_split)) %>%
  recipes::recipe(crop ~ .) %>%
  recipes::update_role(name, new_role = "id") %>%
  recipes::step_string2factor(crop) 

#Define a new model 
rf_tune <- parsnip::rand_forest(trees = 100, mtry = tune(), min_n = tune()) %>% #mtry and min_n parameters are set to "tune()"
  parsnip::set_engine("ranger") %>% #use ranger package
  parsnip::set_mode("classification") #set to classification

#define new workflow with recipe and model
tune_wf <- workflows::workflow() %>%
  workflows::add_recipe(rf_recipe) %>%
  workflows::add_model(rf_tune)

#tune the models
tune_res <- tune::tune_grid(tune_wf, resamples = plant_folds, grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry
#collect metrics for tuning
tune_metrics <- tune::collect_metrics(tune_res)

#reshape data from wide to long format for plotting
p1data <- tune_metrics %>%
  tidyr::pivot_longer(min_n:mtry, values_to = "value",names_to = "parameter")

#plot data
p1 <- p1data %>%  
  ggplot(aes(x = value, y = mean)) + 
  geom_point() + 
  facet_wrap(parameter~.metric, scales = "free")
p1

### Model Selection

#select the best parameters from tuning routine
best_params <- tune::select_best(tune_res, "roc_auc")
#final model using parameters selected from tune results
final_rf <- tune::finalize_model(rf_tune, best_params)

#add final model into workflow
final_wf <- workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(final_rf)

#fit the finalized model to the testing data 
final_res <- final_wf %>% tune::last_fit(plant_split)
collect_metrics(final_res)

Additional Example with PCA and KNN

#Create Sample data
big_mat <- as.data.frame(matrix(rexp(20000, rate=2), ncol=200))
big_mat$var <- as.factor(sample(c(TRUE, FALSE), nrow(big_mat), replace = TRUE))

#Define PCA Recipe
recipe <- head(big_mat) %>%
  recipes::recipe(var ~ .) %>%
  recipes::step_center(all_predictors()) %>%
  recipes::step_scale(all_predictors()) %>%
  step_pca(all_predictors(), num_comp = 2)

#Define KNN model
model <- nearest_neighbor(neighbors = 5, weight_func = "rectangular") %>%
  set_engine("kknn") %>%
  set_mode("classification") 

#Create workflow
workflow <- workflow() %>%
  add_recipe(recipe) %>%
  add_model(model)

#fit model
fit <- fit(workflow, big_mat)
#model predictions
preds <- predict(fit, big_mat)
#bind model predictions to original data
res <- cbind(big_mat, preds) %>%
  select(var, .pred_class) 

#calculate performance metrics
res_metrics <- metrics(res, truth = var, estimate = .pred_class)