The tidymodels framework is a collection of packages for modeling and machine learning using tidyverse principles. The tidyverse principles are a set of opinionated but consistent practices for data organization and analysis. They are intended to minimize the time spent preparing experimental data for analysis, so-called "data-wrangling" or "data-munging". This standard also provides consistency across data analysis packages, as all tidy packages are designed to expect a specifically structured "tidy" dataframe. The core of these packages is the "tidyverse" written by Hadley Wickham. There is an extended family of tidy packages written by third-party developers following the tidy framework. While the core tidyverse packages focus on data manipulation, the "tidymodels" packages are designed to extend these principles into the modeling and analysis pipeline.
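For example, a tidy dataframe places each variable in its own column and each observation in its own row; tidyr can reshape wide data into this layout. Below is a minimal sketch using made-up plot heights (the column names are invented for illustration):
library(tidyr)
#wide layout: one height column per measurement day (made-up values)
wide <- data.frame(plot = c(1, 2), day10 = c(2.1, 1.8), day20 = c(4.3, 3.9))
#tidy layout: one row per plot-day observation
tidy_heights <- pivot_longer(wide, cols = c(day10, day20), names_to = "day", values_to = "height")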
In the context of high throughput phenotyping, the tidymodels packages are very useful for analyzing data collected from image analysis programs. The framework also provides a useful way to learn machine learning and modeling workflows, which are used to train image analysis algorithms. We will demonstrate how to use the tidymodels packages for plant phenotyping analysis using output datasets from other plant phenotyping workbooks.
There is an accompanying PowerPoint presentation to this R Markdown document which explains the steps of this workflow in more detail.
#Load necessary packages
list.of.packages <- c("tidyverse", "tidymodels", "ranger", "kknn", "janitor", "pryr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages, repos = "https://cloud.r-project.org")
invisible(capture.output(lapply(list.of.packages, library, character.only = TRUE, warn.conflicts = FALSE, quietly = T)))
## -- Attaching packages --------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.1
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## -- Attaching packages -------------------------------------------------- tidymodels 0.1.1 --
## v broom 0.7.0 v recipes 0.1.13
## v dials 0.0.8 v rsample 0.0.7
## v infer 0.5.3 v tune 0.1.1
## v modeldata 0.0.2 v workflows 0.1.2
## v parsnip 0.1.3 v yardstick 0.0.7
## -- Conflicts ----------------------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
rm(list = ls())
#set random seed for reproducibility
set.seed(123)
#Read in data
plant_data <- readr::read_csv("pdata.csv") %>% #read in CSV file
dplyr::select(-c("Trait", "myIndex")) %>% #drop columns not needed for this analysis
dplyr::mutate(fertilize = Plot %% 2 == 0) %>% #add a boolean fertilizer variable
dplyr::mutate(crop = factor(sample(c("maize", "soybean", "alfalfa"), size = nrow(.), replace = TRUE))) %>% #randomly assign a crop to each plot
janitor::clean_names() #convert column names to lowercase, remove punctuation
## Parsed with column specification:
## cols(
## DAP = col_double(),
## Name = col_character(),
## Column = col_double(),
## Row = col_double(),
## Plot = col_double(),
## Trait = col_double(),
## NGRDI = col_double(),
## BGI = col_double(),
## GLI = col_double(),
## VARI = col_double(),
## myIndex = col_double(),
## EPH = col_double()
## )
colnames(plant_data) #look at column names
## [1] "dap" "name" "column" "row" "plot" "ngrdi"
## [7] "bgi" "gli" "vari" "eph" "fertilize" "crop"
head(plant_data) #peek at data
#split the dataset into a training and testing set
plant_split <- rsample::initial_split(plant_data, prop = 0.9) #create a split object where a 9/10 proportion of the data is used for training
plant_train <- rsample::training(plant_split) #isolate the training set into one object
plant_test <- rsample::testing(plant_split) #isolate the testing set into one object
tibble::glimpse(plant_test) #peek at plant testing set
## Rows: 631
## Columns: 12
## $ dap <dbl> 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 4...
## $ name <chr> "G51", "G92", "G37", "G96", "G35", "G13", "G3", "G58", "G...
## $ column <dbl> 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5...
## $ row <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9,...
## $ plot <dbl> 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 13...
## $ ngrdi <dbl> 0.08519463, 0.11104453, 0.11680974, 0.10792150, 0.1055199...
## $ bgi <dbl> 0.6222127, 0.6161263, 0.6080161, 0.6128025, 0.6307734, 0....
## $ gli <dbl> 0.1544255, 0.1707747, 0.1767045, 0.1701369, 0.1627840, 0....
## $ vari <dbl> 0.1282276, 0.1688281, 0.1770045, 0.1633977, 0.1617865, 0....
## $ eph <dbl> 0.04168053, 0.06098519, 0.06159830, 0.06614864, 0.0546606...
## $ fertilize <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE...
## $ crop <fct> alfalfa, alfalfa, alfalfa, soybean, alfalfa, soybean, soy...
plant_recipe <- head(plant_train) %>%
recipes::recipe(eph ~ ngrdi)
#if the model formula is extremely long, it may be easier to define the variable roles with the update_role() function instead
plant_recipe2 <- head(plant_train) %>%
recipes::recipe() %>%
recipes::update_role(eph, new_role = "outcome") %>% #declare the response variable
recipes::update_role(ngrdi, new_role = "predictor") #declare the predictor variable
#"selector functions can be used to select a group of variables at one time
plant_recipe3 <- head(plant_train) %>%
recipes::recipe() %>%
recipes::update_role(matches("ep."), new_role = "outcome") %>%
recipes::update_role(contains("N"), new_role = "predictor")
#you can also use step functions to transform variables (https://recipes.tidymodels.org/reference/index.html#section-step-functions-individual-transformations)
plant_recipe4 <- head(plant_train) %>%
recipes::recipe(eph ~ ngrdi) %>%
recipes::step_center(all_predictors(), -all_outcomes()) %>%
recipes::step_scale(all_predictors(), -all_outcomes())
#prep the recipe: estimate any parameters the steps require from the data stored in the recipe
plant_recipe <- recipes::prep(plant_recipe)
#extract the data the recipe was prepped with, with all steps applied
plant_head <- recipes::juice(plant_recipe)
#apply recipe to data of interest
plant_train <- recipes::bake(plant_recipe, plant_train)
plant_test <- recipes::bake(plant_recipe, plant_test)
tibble::glimpse(plant_test)
## Rows: 631
## Columns: 2
## $ ngrdi <dbl> 0.08519463, 0.11104453, 0.11680974, 0.10792150, 0.10551998, 0...
## $ eph <dbl> 0.04168053, 0.06098519, 0.06159830, 0.06614864, 0.05466063, 0...
#Define model
linear_lm <- linear_reg() %>% #define a linear regression model
set_engine("lm") %>% #decide what package you want to use for the linear regression
set_mode("regression") #set regression or classification depending on model specifications
#alternative model specification for comparison
linear_stan <- linear_reg() %>%
set_engine("stan") %>% #the "stan" engine fits the same model with rstanarm
set_mode("regression")
#note that engines do not have to be R packages; tidymodels also supports interfacing with outside programs such as keras or tensorflow, which is one of the benefits of using R
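#to check which engines are available for a given model specification, recent versions of parsnip provide show_engines()
parsnip::show_engines("linear_reg")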
#create a workflow
plant_workflow <- workflows::workflow() %>% #define workflow
workflows::add_recipe(plant_recipe) %>% #add recipe specification
workflows::add_model(linear_lm) #add model specification
#fit workflow to data
plant_fit <- parsnip::fit(plant_workflow, plant_train)
#predict test data from fitting model
plant_pred <- predict(plant_fit, plant_test)
#bind prediction results to test data for easy comparison
plant_test <- dplyr::bind_cols(plant_test, plant_pred)
tibble::glimpse(plant_test)
## Rows: 631
## Columns: 3
## $ ngrdi <dbl> 0.08519463, 0.11104453, 0.11680974, 0.10792150, 0.10551998, 0...
## $ eph <dbl> 0.04168053, 0.06098519, 0.06159830, 0.06614864, 0.05466063, 0...
## $ .pred <dbl> 0.10443734, 0.15757514, 0.16942625, 0.15115535, 0.14621872, 0...
#calculate model performance metrics (default)
plant_metrics <- yardstick::metrics(plant_test, truth = eph, estimate = .pred)
plant_metrics
#calculate a single model performance metric
yardstick::rmse(data = plant_test, truth = eph, estimate = .pred)
#calculate a custom set of model performance metrics
plant_multi <- yardstick::metric_set(rmse, rsq)
plant_multi(data = plant_test, truth = eph, estimate = .pred)
#Split training data for cross validation scheme into 5 folds with 5 repeats
plant_folds <- rsample::vfold_cv(training(plant_split), v = 5, repeats = 5)
#define a control object to change settings in the cross validation process
plant_control <- tune::control_grid(
verbose = FALSE, #whether to print out progress while fitting
allow_par = TRUE, #allow parallel processing
extract = function(x){extract_model(x)}, #extract the individual fitting model object for each split
save_pred = TRUE #save model predictions
)
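#note: allow_par = TRUE only takes effect if a parallel backend is registered beforehand; a minimal sketch using the doParallel package (assumed installed)
library(doParallel)
cl <- parallel::makeCluster(2) #start two worker processes
doParallel::registerDoParallel(cl) #register the workers as the foreach backend used by tune
#run parallel::stopCluster(cl) once resampling is finished to release the workers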
#fit the workflow to the folds object, with control settings
plant_cv <- tune::fit_resamples(plant_workflow, plant_folds, control = plant_control)
#collect performance metrics from the folds
plant_metrics <- tune::collect_metrics(plant_cv)
#collect predictions from the folds
plant_predictions <- tune::collect_predictions(plant_cv)
#reset the seed from the current time so the custom splits vary between runs
set.seed(as.integer(Sys.time()))
#create list to store folds
custom_folds <- list()
#create 25 splits of the training data
for(i in 1:25){
custom_folds[[i]] <- initial_split(training(plant_split), prop = 0.8)
}
#convert the list of splits to a tibble
custom_folds <- tibble(custom_folds)
#compare object sizes: rsample splits index into a single copy of the data, so they are far smaller than repeated manual copies
pryr::object_size(plant_folds)
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
## 137 kB
pryr::object_size(custom_folds)
## 1.5 MB
#Define a new recipe for the random forest model
rf_recipe <- head(training(plant_split)) %>%
recipes::recipe(crop ~ .) %>%
recipes::update_role(name, new_role = "id") %>%
recipes::step_string2factor(crop)
#Define a new model
rf_tune <- parsnip::rand_forest(trees = 100, mtry = tune(), min_n = tune()) %>% #mtry and min_n parameters are set to "tune()"
parsnip::set_engine("ranger") %>% #use ranger package
parsnip::set_mode("classification") #set to classification
#define new workflow with recipe and model
tune_wf <- workflows::workflow() %>%
workflows::add_recipe(rf_recipe) %>%
workflows::add_model(rf_tune)
#tune the models
tune_res <- tune::tune_grid(tune_wf, resamples = plant_folds, grid = 5)
## i Creating pre-processing data to finalize unknown parameter: mtry
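#instead of letting tune_grid() draw 5 random candidates, you can supply an explicit grid; a sketch using dials, with illustrative (assumed) parameter ranges
rf_grid <- dials::grid_regular(
dials::mtry(range = c(1L, 4L)), #number of predictors sampled at each split
dials::min_n(range = c(2L, 10L)), #minimum node size
levels = 3) #3 levels per parameter gives 9 candidate combinations
#tune_res <- tune::tune_grid(tune_wf, resamples = plant_folds, grid = rf_grid)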
#collect metrics for tuning
tune_metrics <- tune::collect_metrics(tune_res)
#reshape data from wide to long format for plotting
p1data <- tune_metrics %>%
tidyr::pivot_longer(min_n:mtry, values_to = "value", names_to = "parameter")
#plot data
p1 <- p1data %>%
ggplot(aes(x = value, y = mean)) +
geom_point() +
facet_wrap(parameter~.metric, scales = "free")
p1
### Model Selection
#select the best parameters from tuning routine
best_params <- tune::select_best(tune_res, "roc_auc")
#final model using parameters selected from tune results
final_rf <- tune::finalize_model(rf_tune, best_params)
#add final model into workflow
final_wf <- workflow() %>%
add_recipe(rf_recipe) %>%
add_model(final_rf)
#fit the finalized workflow to the training data and evaluate it on the testing data
final_res <- final_wf %>% tune::last_fit(plant_split)
collect_metrics(final_res)
#Create Sample data
big_mat <- as.data.frame(matrix(rexp(20000, rate=2), ncol=200))
big_mat$var <- as.factor(sample(c(TRUE, FALSE), nrow(big_mat), replace = TRUE))
#Define PCA recipe (named pca_recipe so it does not mask recipes::recipe())
pca_recipe <- head(big_mat) %>%
recipes::recipe(var ~ .) %>%
recipes::step_center(all_predictors()) %>%
recipes::step_scale(all_predictors()) %>%
recipes::step_pca(all_predictors(), num_comp = 2)
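#to inspect the learned PCA loadings, prep the recipe on the full data and tidy the pca step (step 3 in the order defined above)
pca_prepped <- recipes::prep(pca_recipe, training = big_mat)
recipes::tidy(pca_prepped, number = 3)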
#Define KNN model
knn_model <- nearest_neighbor(neighbors = 5, weight_func = "rectangular") %>%
set_engine("kknn") %>%
set_mode("classification")
#Create workflow
knn_workflow <- workflow() %>%
add_recipe(pca_recipe) %>%
add_model(knn_model)
#fit model
knn_fit <- fit(knn_workflow, big_mat)
#model predictions
knn_preds <- predict(knn_fit, big_mat)
#bind model predictions to original data for easy comparison
res <- cbind(big_mat, knn_preds) %>%
select(var, .pred_class)
#calculate performance metrics
res_metrics <- metrics(res, truth = var, estimate = .pred_class)
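#a confusion matrix is a useful companion to these metrics for classification problems
yardstick::conf_mat(res, truth = var, estimate = .pred_class)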