library(tidyverse)
# find data at https://doi.org/10.1002/agj2.20812
<- readxl::read_xlsx("data/8.Yield_Plant_Measurements.xlsx", sheet = 2) %>%
dd_yield filter(Year == 2014,
== "IA",
State == "Ames") %>%
Site mutate(across(VT_TissN:R4N, ~ as.numeric(.)))
<- readxl::read_xlsx("data/1.Site_Characterization.xlsx", sheet = 2) %>%
dd_soil filter(Year == 2014,
== "IA",
State == "Ames",
Site == "Ap") %>%
Horizon mutate(Block = as.numeric(Block),
across(Clay:WC, ~as.numeric(.)))
<- readxl::read_xlsx("data/4.SoilN.xlsx", sheet = 2) %>%
dd_N filter(Year == 2014,
== "IA",
State == "Ames",
Site %in% dd_yield$Plot_ID,
Plot_ID == "Post") %>%
Sam_Time mutate(across(c(Plot_ID, N_Trt, Plant_N, Side_N, Plant_N_SI, Side_N_SI), ~as.numeric(.)))
<- dd_yield %>%
dd_complete left_join(dd_soil) %>%
left_join(dd_N)
<- dd_complete %>%
dd_complete rownames_to_column("id") %>%
mutate(gyield_14 = GYdry/.94,
N_total = Plant_N_SI + Side_N_SI,
sy = paste0(Site," (", Year, ")"),
sy_f = as.numeric(factor(sy)),
id = as.numeric(id))
Day 22 - 10/09/2024
Announcements
- A really quick Kahoot
- Today’s office hours are 1-2pm.
Model selection
- How many variables should we include in our model?
- Bias-variance trade-off (show on whiteboard)
- What defines the “goodness” of a model?
  - Penalized fit (i.e., a penalty for including too many parameters); see the AIC/BIC sketch after this list.
  - Out-of-sample prediction accuracy
    - Validation set
    - Cross-validation (k-fold / leave-one-out)
- A paper on how to evaluate predictions versus observed values.
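A minimal sketch of the penalized-fit idea, using R's built-in mtcars data rather than the course dataset (this example is not part of the original notes):

# both criteria reward fit (via the log-likelihood) but subtract points for extra parameters
fit_small <- lm(mpg ~ hp, data = mtcars)
fit_big   <- lm(mpg ~ hp + wt + disp + drat, data = mtcars)
AIC(fit_small, fit_big)  # AIC = -2*logLik + 2*k, where k = number of estimated parameters
BIC(fit_small, fit_big)  # BIC = -2*logLik + log(n)*k, a heavier penalty once n is larger than about 7

Extra terms are only "worth it" if they improve the log-likelihood by more than the penalty they add.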
Variable selection based on out-of-sample prediction accuracy
\[y\sim N(\mu, \sigma^2)\]
\[\mu = \mathbf{x}'\boldsymbol{\beta}\]
\[\hat{\mu}_{new} = \mathbf{x}_{new}'\hat{\boldsymbol{\beta}}\]
Under the out-of-sample prediction accuracy criterion, we evaluate how well a model estimates \(\mu_{new}\) by checking how far the estimate \(\hat{\mu}_{new}\) falls from \(y_{new}\), an observed value that was not used to fit the model.
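A common way to summarize this distance over \(n_{new}\) held-out observations is the root mean squared error, which is what the code below computes:

\[\text{RMSE} = \sqrt{\frac{1}{n_{new}}\sum_{i=1}^{n_{new}}\left(y_{new,i} - \hat{\mu}_{new,i}\right)^2}\]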
Load the data
dd_complete %>%
  ggplot(aes(N_total, gyield_14)) +
  geom_point() +
  facet_wrap(~sy) +
  coord_cartesian(ylim = c(0, 15000)) +
  theme_classic() +
  labs(x = expression(Total~N~Applied~(kg~ha^{-1})),
       y = expression(Grain~Yield~(kg~ha^{-1})))
Validation set approach
id <- unique(dd_complete$id)
n  <- length(id)

set.seed(54)
subset_fit <- sample(id, size = n*.6, replace = FALSE)  # ~60% of the ids go to the fit (training) set

dd_complete_fit <- dd_complete %>%
  filter(id %in% subset_fit)

dd_complete_test <- dd_complete %>%
  filter(!id %in% subset_fit)

# candidate models of increasing complexity, fit on the training split only
m1 <- lm(gyield_14 ~ N_total + I(N_total^2), data = dd_complete_fit)
m2 <- lm(gyield_14 ~ N_total + I(N_total^2) + Block, data = dd_complete_fit)
m3 <- lm(gyield_14 ~ N_total + I(N_total^2) + Block + PMPM, data = dd_complete_fit)
m4 <- lm(gyield_14 ~ N_total + I(N_total^2) + Block + Sand, data = dd_complete_fit)
m5 <- lm(gyield_14 ~ N_total + I(N_total^2) + Block + Sand + Clay, data = dd_complete_fit)
m6 <- lm(gyield_14 ~ N_total + I(N_total^2) + Block + Sand + Clay + Silt, data = dd_complete_fit)
# predict the held-out rows with each model
dd_complete_test$pred_m1 <- predict(m1, newdata = dd_complete_test)
dd_complete_test$pred_m2 <- predict(m2, newdata = dd_complete_test)
dd_complete_test$pred_m3 <- predict(m3, newdata = dd_complete_test)
dd_complete_test$pred_m4 <- predict(m4, newdata = dd_complete_test)
dd_complete_test$pred_m5 <- predict(m5, newdata = dd_complete_test)
dd_complete_test$pred_m6 <- predict(m6, newdata = dd_complete_test)

# out-of-sample RMSE for each model
dd_complete_test %>%
  summarise(rmse_m1 = sqrt(mean((gyield_14 - pred_m1)^2)),
            rmse_m2 = sqrt(mean((gyield_14 - pred_m2)^2)),
            rmse_m3 = sqrt(mean((gyield_14 - pred_m3)^2)),
            rmse_m4 = sqrt(mean((gyield_14 - pred_m4)^2)),
            rmse_m5 = sqrt(mean((gyield_14 - pred_m5)^2)),
            rmse_m6 = sqrt(mean((gyield_14 - pred_m6)^2)))
# A tibble: 1 × 6
rmse_m1 rmse_m2 rmse_m3 rmse_m4 rmse_m5 rmse_m6
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 637. 627. 618. 640. 648. 648.
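The same six RMSEs can be computed more compactly by reshaping the predictions to long format; this is a sketch added here for reference, not part of the original notes:

dd_complete_test %>%
  pivot_longer(cols = pred_m1:pred_m6, names_to = "model", values_to = "pred") %>%
  group_by(model) %>%
  summarise(rmse = sqrt(mean((gyield_14 - pred)^2)))  # one RMSE per model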
dd_complete_test %>%
  pivot_longer(cols = pred_m1:pred_m6) %>%
  mutate(name = str_replace(name, "pred_", "")) %>%
  ggplot(aes(gyield_14, value)) +
  geom_point() +
  labs(x = expression(Observed~(kg~ha^{-1})),
       y = expression(Predicted~(kg~ha^{-1}))) +
  facet_wrap(~name) +
  geom_abline(slope = 1) +
  theme_classic()
Cross-validation
- \(k\)-fold cross-validation (a by-hand sketch follows below, before the caret examples)
- leave-one-out cross-validation (\(n\)-fold cross-validation)
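Before handing this to caret, here is a by-hand sketch of \(k\)-fold CV for the quadratic N-response model; the fold assignment and the seed value are arbitrary choices added here, not from the original notes:

set.seed(54)
k <- 10
folds <- sample(rep(1:k, length.out = nrow(dd_complete)))  # randomly assign each row to a fold
rmse_by_fold <- sapply(1:k, function(i) {
  fit  <- lm(gyield_14 ~ N_total + I(N_total^2), data = dd_complete[folds != i, ])  # fit on k-1 folds
  pred <- predict(fit, newdata = dd_complete[folds == i, ])                         # predict the held-out fold
  sqrt(mean((dd_complete$gyield_14[folds == i] - pred)^2))
})
mean(rmse_by_fold)  # average held-out RMSE across the k folds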
library(caret)
caret::train(gyield_14 ~ N_total + I(N_total^2), data = dd_complete, method = "lm",
             trControl = trainControl(method = "cv", number = 10))
Linear Regression
64 samples
1 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 56, 57, 56, 59, 58, 57, ...
Resampling results:
RMSE Rsquared MAE
542.9698 0.7949004 452.1508
Tuning parameter 'intercept' was held constant at a value of TRUE
caret::train(gyield_14 ~ N_total + I(N_total^2), data = dd_complete, method = "lm",
             trControl = trainControl(method = "LOOCV"))
Linear Regression
64 samples
1 predictor
No pre-processing
Resampling: Leave-One-Out Cross-Validation
Summary of sample sizes: 63, 63, 63, 63, 63, 63, ...
Resampling results:
RMSE Rsquared MAE
594.5187 0.8525543 472.4274
Tuning parameter 'intercept' was held constant at a value of TRUE
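One practical note (not in the original notes): LOOCV is deterministic, but the 10-fold numbers change with the random fold assignment, so setting a seed before train() makes them reproducible. The seed value below is arbitrary:

set.seed(2024)  # arbitrary value; fixes the random fold assignment
caret::train(gyield_14 ~ N_total + I(N_total^2), data = dd_complete, method = "lm",
             trControl = trainControl(method = "cv", number = 10))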
kcv_m1 <- caret::train(gyield_14 ~ N_total + I(N_total^2), data = dd_complete, method = "lm",
                       trControl = trainControl(method = "cv", number = 10))
loocv_m1 <- caret::train(gyield_14 ~ N_total + I(N_total^2), data = dd_complete, method = "lm",
                         trControl = trainControl(method = "LOOCV"))

kcv_m4 <- caret::train(gyield_14 ~ N_total + I(N_total^2) + Block + Sand, data = dd_complete, method = "lm",
                       trControl = trainControl(method = "cv", number = 10))
loocv_m4 <- caret::train(gyield_14 ~ N_total + I(N_total^2) + Block + Sand, data = dd_complete, method = "lm",
                         trControl = trainControl(method = "LOOCV"))

kcv_m5 <- caret::train(gyield_14 ~ N_total + I(N_total^2) + Block + Sand + Clay, data = dd_complete, method = "lm",
                       trControl = trainControl(method = "cv", number = 10))
loocv_m5 <- caret::train(gyield_14 ~ N_total + I(N_total^2) + Block + Sand + Clay, data = dd_complete, method = "lm",
                         trControl = trainControl(method = "LOOCV"))

kcv_m6 <- caret::train(gyield_14 ~ N_total + I(N_total^2) + Block + Sand + Clay + Silt,
                       data = dd_complete, method = "lm",
                       trControl = trainControl(method = "cv", number = 10))
loocv_m6 <- caret::train(gyield_14 ~ N_total + I(N_total^2) + Block + Sand + Clay + Silt,
                         data = dd_complete, method = "lm",
                         trControl = trainControl(method = "LOOCV"))
metrics <- data.frame(model = c("N_total + I(N_total^2)",
                                "N_total + I(N_total^2) + Block",
                                "N_total + I(N_total^2) + Block + PMPM",
                                "N_total + I(N_total^2) + Block + Sand",
                                "N_total + I(N_total^2) + Block + Sand + Clay",
                                "N_total + I(N_total^2) + Block + Sand + Clay + Silt"),
                      R2 = c(summary(m1)$r.squared, summary(m2)$r.squared,
                             summary(m3)$r.squared, summary(m4)$r.squared,
                             summary(m5)$r.squared, summary(m6)$r.squared),
                      R2_adj = c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared,
                                 summary(m3)$adj.r.squared, summary(m4)$adj.r.squared,
                                 summary(m5)$adj.r.squared, summary(m6)$adj.r.squared),
                      AIC = AIC(m1, m2, m3, m4, m5, m6)$AIC,
                      BIC = BIC(m1, m2, m3, m4, m5, m6)$BIC,
                      `RMSE.kCV` = c(kcv_m1$results$RMSE, NA, NA,
                                     kcv_m4$results$RMSE, kcv_m5$results$RMSE, kcv_m6$results$RMSE),
                      `RMSESD.kCV` = c(kcv_m1$results$RMSESD, NA, NA,
                                       kcv_m4$results$RMSESD, kcv_m5$results$RMSESD, kcv_m6$results$RMSESD),
                      `R2.kCV` = c(kcv_m1$results$Rsquared, NA, NA,
                                   kcv_m4$results$Rsquared, kcv_m5$results$Rsquared, kcv_m6$results$Rsquared),
                      `R2SD.kCV` = c(kcv_m1$results$RsquaredSD, NA, NA,
                                     kcv_m4$results$RsquaredSD, kcv_m5$results$RsquaredSD, kcv_m6$results$RsquaredSD),
                      `RMSE.looCV` = c(loocv_m1$results$RMSE, NA, NA,
                                       loocv_m4$results$RMSE, loocv_m5$results$RMSE, loocv_m6$results$RMSE),
                      `R2.looCV` = c(loocv_m1$results$Rsquared, NA, NA,
                                     loocv_m4$results$Rsquared, loocv_m5$results$Rsquared, loocv_m6$results$Rsquared)) %>%
  mutate(across(R2:`R2.looCV`, ~round(., 2)))

knitr::kable(metrics, format = "html")
| model | R2 | R2_adj | AIC | BIC | RMSE.kCV | RMSESD.kCV | R2.kCV | R2SD.kCV | RMSE.looCV | R2.looCV |
|---|---|---|---|---|---|---|---|---|---|---|
| N_total + I(N_total^2) | 0.84 | 0.83 | 586.91 | 593.46 | 576.23 | 162.27 | 0.81 | 0.20 | 594.52 | 0.85 |
| N_total + I(N_total^2) + Block | 0.84 | 0.82 | 588.69 | 596.87 | NA | NA | NA | NA | NA | NA |
| N_total + I(N_total^2) + Block + PMPM | 0.84 | 0.82 | 589.36 | 599.18 | NA | NA | NA | NA | NA | NA |
| N_total + I(N_total^2) + Block + Sand | 0.84 | 0.82 | 589.59 | 599.41 | 585.76 | 152.05 | 0.80 | 0.27 | 599.30 | 0.85 |
| N_total + I(N_total^2) + Block + Sand + Clay | 0.85 | 0.83 | 588.66 | 600.13 | 575.84 | 186.71 | 0.80 | 0.14 | 599.10 | 0.85 |
| N_total + I(N_total^2) + Block + Sand + Clay + Silt | 0.85 | 0.83 | 588.66 | 600.13 | 588.35 | 138.19 | 0.79 | 0.24 | 599.10 | 0.85 |
Data example 2
<- readxl::read_xlsx("data/8.Yield_Plant_Measurements.xlsx", sheet = 2) %>%
dd_yield filter(State == "IA",) %>%
mutate(across(VT_TissN:R4N, ~ as.numeric(.)))
<- readxl::read_xlsx("data/1.Site_Characterization.xlsx", sheet = 2) %>%
dd_soil filter(State == "IA",
== "Ap") %>%
Horizon mutate(Block = as.numeric(Block),
across(Clay:WC, ~as.numeric(.)))
<- readxl::read_xlsx("data/4.SoilN.xlsx", sheet = 2) %>%
dd_N filter(State == "IA",
%in% dd_yield$Plot_ID,
Plot_ID == "Post") %>%
Sam_Time mutate(across(c(Plot_ID, N_Trt, Plant_N, Side_N, Plant_N_SI, Side_N_SI), ~as.numeric(.)))
<- dd_yield %>%
dd_complete left_join(dd_soil) %>%
left_join(dd_N)
<- dd_complete %>%
dd_complete rownames_to_column("id") %>%
mutate(gyield_14 = GYdry/.94,
N_total = Plant_N_SI + Side_N_SI,
sy = paste0(Site," (", Year, ")"),
sy_f = as.numeric(factor(sy)),
id = as.numeric(id))
dd_complete %>%
  ggplot(aes(N_total, gyield_14)) +
  geom_point() +
  facet_wrap(~sy) +
  coord_cartesian(ylim = c(0, 15000)) +
  theme_classic() +
  labs(x = expression(Total~N~Applied~(kg~ha^{-1})),
       y = expression(Grain~Yield~(kg~ha^{-1})))
- How do we perform cross-validation in this case? (see the grouped-CV sketch below)
- Dependence structures
- Paper: “Cross-validation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure”
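One possible way to respect the grouping (a sketch, not from the original notes, assuming the sy column created above marks the site-year groups) is caret's groupKFold(), which keeps all rows of a group in the same fold; with k equal to the number of groups this should give roughly leave-one-site-year-out folds:

# each fold holds out every row of one sy group; site_folds is a name chosen here for illustration
site_folds <- groupKFold(dd_complete$sy, k = length(unique(dd_complete$sy)))
caret::train(gyield_14 ~ N_total + I(N_total^2), data = dd_complete, method = "lm",
             trControl = trainControl(method = "cv", index = site_folds))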
For next class
- Assignment 3 is due tomorrow at midnight.
- Last few hours to submit your feedback in the survey.