diff --git a/data-raw/EP_functions.R b/data-raw/EP_functions.R deleted file mode 100644 index 9b44d3c7..00000000 --- a/data-raw/EP_functions.R +++ /dev/null @@ -1,165 +0,0 @@ - -## Helper functions - -#All of these are heavily borrowed from nflscrapR (Maksim Horowitz, Ronald Yurko, and Samuel Ventura) - -#original code -#https://github.com/ryurko/nflscrapR-models/blob/master/R/init_models/init_ep_fg_models.R - -######################################################################## -#### helper function for building dataset to estimate EPA model ######## -######################################################################## -#ben is reasonably confident that this works and is finished -#this is only needed to estimate the EP model, not actually adding EP to data -find_game_next_score_half <- function(pbp_dataset) { - - # Which rows are the scoring plays: - score_plays <- which(pbp_dataset$sp == 1 & pbp_dataset$play_type != "no_play") - - # Define a helper function that takes in the current play index, - # a vector of the scoring play indices, play-by-play data, - # and returns the score type and drive number for the next score: - find_next_score <- function(play_i, score_plays_i,pbp_df) { - - # Find the next score index for the current play - # based on being the first next score index: - next_score_i <- score_plays_i[which(score_plays_i >= play_i)[1]] - - # If next_score_i is NA (no more scores after current play) - # or if the next score is in another half, - # then return No_Score and the current drive number - if (is.na(next_score_i) | - (pbp_df$qtr[play_i] %in% c(1, 2) & pbp_df$qtr[next_score_i] %in% c(3, 4, 5)) | - (pbp_df$qtr[play_i] %in% c(3, 4) & pbp_df$qtr[next_score_i] == 5)) { - - score_type <- "No_Score" - - # Make it the current play index - score_drive <- pbp_df$drive[play_i] - - # Else return the observed next score type and drive number: - } else { - - # Store the score_drive number - score_drive <- pbp_df$drive[next_score_i] - - # Then check the play types to decide what to return - # based on several types of cases for the next score: - - # 1: Return TD - if (pbp_df$touchdown[next_score_i] == 1 & (pbp_df$td_team[next_score_i] != pbp_df$posteam[next_score_i])) { - - # For return touchdowns the current posteam would not have - # possession at the time of return, so it's flipped: - if (identical(pbp_df$posteam[play_i], pbp_df$posteam[next_score_i])) { - - score_type <- "Opp_Touchdown" - - } else { - - score_type <- "Touchdown" - - } - } else if (identical(pbp_df$field_goal_result[next_score_i], "made")) { - - # 2: Field Goal - # Current posteam made FG - if (identical(pbp_df$posteam[play_i], pbp_df$posteam[next_score_i])) { - - score_type <- "Field_Goal" - - # Opponent made FG - } else { - - score_type <- "Opp_Field_Goal" - - } - - # 3: Touchdown (returns already counted for) - } else if (pbp_df$touchdown[next_score_i] == 1) { - - # Current posteam TD - if (identical(pbp_df$posteam[play_i], pbp_df$posteam[next_score_i])) { - - score_type <- "Touchdown" - - # Opponent TD - } else { - - score_type <- "Opp_Touchdown" - - } - # 4: Safety (similar to returns) - } else if (pbp_df$safety[next_score_i] == 1) { - - if (identical(pbp_df$posteam[play_i],pbp_df$posteam[next_score_i])) { - - score_type <- "Opp_Safety" - - } else { - - score_type <- "Safety" - - } - # 5: Extra Points - } else if (identical(pbp_df$extra_point_result[next_score_i], "good")) { - - # Current posteam Extra Point - if (identical(pbp_df$posteam[play_i], pbp_df$posteam[next_score_i])) { - - score_type <- "Extra_Point" - - # Opponent Extra Point - } else { - - score_type <- "Opp_Extra_Point" - - } - # 6: Two Point Conversions - } else if (identical(pbp_df$two_point_conv_result[next_score_i], "success")) { - - # Current posteam Two Point Conversion - if (identical(pbp_df$posteam[play_i], pbp_df$posteam[next_score_i])) { - - score_type <- "Two_Point_Conversion" - - # Opponent Two Point Conversion - } else { - - score_type <- "Opp_Two_Point_Conversion" - - } - - # 7: Defensive Two Point (like returns) - } else if (identical(pbp_df$defensive_two_point_conv[next_score_i], 1)) { - - if (identical(pbp_df$posteam[play_i], pbp_df$posteam[next_score_i])) { - - score_type <- "Opp_Defensive_Two_Point" - - } else { - - score_type <- "Defensive_Two_Point" - - } - - # 8: Errors of some sort so return NA (but shouldn't take place) - } else { - - score_type <- NA - - } - } - - return(data.frame(Next_Score_Half = score_type, - Drive_Score_Half = score_drive)) - } - - # Using lapply and then bind_rows is much faster than - # using map_dfr() here: - lapply(c(1:nrow(pbp_dataset)), find_next_score, - score_plays_i = score_plays, pbp_df = pbp_dataset) %>% - bind_rows() %>% - return -} - diff --git a/data-raw/MODEL-README.Rmd b/data-raw/MODEL-README.Rmd deleted file mode 100644 index 79ba100a..00000000 --- a/data-raw/MODEL-README.Rmd +++ /dev/null @@ -1,716 +0,0 @@ ---- -title: "nflfastR EP, WP, and CP models" -author: Ben Baldwin -output: - github_document: - toc: true - toc_depth: 3 ---- - -```{r opts, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - fig.path = "man/figures/README-", - out.width = "100%" -) -``` - -## About - -This page describes the nflfastR Expected Points (EP), Win Probability (WP), and Completion Percentage (CP) models before showing that they are well calibrated using the procedure [introduced by Yurko, Ventura, and Horowitz](https://arxiv.org/pdf/1802.00998.pdf). Because the 2020 season will mark 22 seasons of nflfastR data, the main purpose behind creating new models for EP and WP was to build in era adjustments to fascilitate better cross-era comparisons. However, we also discovered that switching to tree-based methods could improve model calibration, especially for end-of-half situations with complicated nonlinear interactions between variables. Because we are introducing new models, we compare our calibation results to nflscrapR to show that these new models are somewhat better calibrated. If they weren't, there would be no point in updating the models! - -nflfastR switching from the nflscrapR EP and WP models to its own model should not be thought of as a criticism of nflscrapR: the improvements are relatively minor and nflscrapR provided the code base to perform much of this analysis, breaking new ground in the process. - -## Model features - -The EP, WP, and CP models are trained using xgboost, which uses training data to create [decision trees](https://xgboost.readthedocs.io/en/latest/tutorials/model.html). - -**EP model features** - -* Seconds remaining in half -* Yard line -* Whether possession team is at home -* Roof type: retractable, dome, or outdoors -* Down -* Yards to go -* Era: 1999-2001 (pre-expansion), 2002-2005 (pre-CPOE), 2006-2013 (pre-LOB rules change), 2014-2017, 2018 and beyond -* Week (the model is trained on regular season only, so for playoff games, we impute week = 17) -* Timeouts remaining for each team - -**WP model features** - -* Seconds remaining in half -* Seconds remaining in game -* Yard line -* Expected Points -* Score differential -* Ratio of expected score differential (expected points + point differential) to time remaining (feature borrowed from nflscrapR) -* Down -* Yards to go -* Timeouts remaining for each team -* Whether team will receive 2nd half kickoff -* Whether possession team is at home -* [Model with Vegas line only: point spread * log(3600 / (50 + (seconds elapsed in game))] - -**CP model features** - -* Yard line -* Whether possession team is at home -* Roof type: retractable, dome, or outdoors -* Down -* Yards to go -* Distance to sticks (air yards - yards to go) -* Era: 2006-2013, 2014-2017, 2018 and beyond (note that air yards data only go back to 2006, so there is no CP for earlier years) -* Week (the model is trained on regular season only, so for playoff games, we impute week = 17) -* Air yards -* Whether air yards is 0 (probably unnecessary with tree-based method and a relic from earlier models where it was included because completion percentage is much lower for 0 air yard passes) -* Pass location (binary: middle or not middle) -* Whether quarterback was hit on the play - -## EP Model Calibration Results - -The goal of this section is to show that the nflfastR EP model is well calibrated. To measure calibration, we follow Yurko et al. and perform leave-one-season-out (LOSO) calibration. In particular, for each of the 20 available seasons (2000-2019), we exclude one season, train the EP model on the other 19 seasons, and then compare the model's predictions in the holdout season to what actually happened in that season. If the model is well calibrated, we would expect that, for example, 50 percent of plays with a touchdown probability of 50 percent prior to the play would have the next score be a touchdown for the possession team. - -Let's start with some setup. The file used here isn't pushed because it's large, but its creation can [be seen here](https://github.com/mrcaseb/fastscraper/blob/ben-upstream/data-raw/MODELS.R). -```{r setup, results = 'hide', message = FALSE} -set.seed(2013) #GoHawks -library(tidyverse) -library(xgboost) - -#some helper files are in these -source('../R/helper_add_nflscrapr_mutations.R') -source('../R/helper_add_ep_wp.R') -source('../R/helper_add_cp_cpoe.R') - -pbp_data <- readRDS('cal_data.rds') -model_data <- pbp_data %>% - #in '../R/helper_add_nflscrapr_mutations.R' - make_model_mutations() %>% - mutate( - label = case_when( - Next_Score_Half == "Touchdown" ~ 0, - Next_Score_Half == "Opp_Touchdown" ~ 1, - Next_Score_Half == "Field_Goal" ~ 2, - Next_Score_Half == "Opp_Field_Goal" ~ 3, - Next_Score_Half == "Safety" ~ 4, - Next_Score_Half == "Opp_Safety" ~ 5, - Next_Score_Half == "No_Score" ~ 6 - ), - label = as.factor(label), - #use nflscrapR weights - Drive_Score_Dist = Drive_Score_Half - drive, - Drive_Score_Dist_W = (max(Drive_Score_Dist) - Drive_Score_Dist) / - (max(Drive_Score_Dist) - min(Drive_Score_Dist)), - ScoreDiff_W = (max(abs(score_differential), na.rm=T) - abs(score_differential)) / - (max(abs(score_differential), na.rm=T) - min(abs(score_differential), na.rm=T)), - Total_W = Drive_Score_Dist_W + ScoreDiff_W, - Total_W_Scaled = (Total_W - min(Total_W, na.rm=T)) / - (max(Total_W, na.rm=T) - min(Total_W, na.rm=T)) - ) %>% - filter( - !is.na(defteam_timeouts_remaining), !is.na(posteam_timeouts_remaining), - !is.na(yardline_100) - ) %>% - select( - label, - season, - half_seconds_remaining, - yardline_100, - home, - retractable, - dome, - outdoors, - ydstogo, - era0, era1, era2, era3, era4, - down1, down2, down3, down4, - posteam_timeouts_remaining, - defteam_timeouts_remaining, - model_week, - Total_W_Scaled - ) - -#idk why this is all necessary for xgb but it is -model_data <- model_data %>% - mutate(label = as.numeric(label), - label = label - 1) - -rm(pbp_data) - -seasons <- unique(model_data$season) -``` - -Input the stuff we'll need to fit the model. The parameters were obtained from cross-validation, where each season was forced to be entirely contained in a given CV fold to prevent leakage in labels from one fold to another (for example, if a given drive were split up between folds). -```{r params} -nrounds = 70 -params <- - list( - booster = "gbtree", - objective = "multi:softprob", - eval_metric = c("mlogloss"), - num_class = 7, - eta = 0.2, - gamma = .2, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 4, - min_child_weight = .9 - ) -``` - -Now do the LOSO model fitting. -```{r loso_fit, results = 'hide'} -cv_results <- map_dfr(seasons, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-label, -Total_W_Scaled)), - label = train_data$label, weight = train_data$Total_W_Scaled) - ep_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(ep_model, as.matrix(test_data %>% select(-label, -Total_W_Scaled))), ncol=7, byrow=TRUE) - ) - colnames(preds) <- c("Touchdown","Opp_Touchdown","Field_Goal","Opp_Field_Goal", - "Safety","Opp_Safety","No_Score") - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#get the BINS for the calibration plot -plot <- cv_results %>% - select(Touchdown, Opp_Touchdown, Field_Goal, Opp_Field_Goal, Safety, Opp_Safety, No_Score, label) %>% - pivot_longer(-label, names_to = 'type', values_to = 'pred_prob') %>% - mutate(bin_pred_prob = round(pred_prob / 0.05) * .05) %>% - mutate(outcome = case_when( - label == 0 ~ "Touchdown", - label == 1 ~ "Opp_Touchdown", - label == 2 ~ "Field_Goal", - label == 3 ~ "Opp_Field_Goal", - label == 4 ~ "Safety", - label == 5 ~ "Opp_Safety", - label == 6 ~ "No_Score" - )) %>% - group_by(type, bin_pred_prob) %>% - mutate(correct = if_else(outcome == type, 1, 0)) %>% - summarize(n_plays = n(), - n_outcome = sum(correct), - bin_actual_prob = n_outcome / n_plays) -``` - -Here is the EP calibration plot. Points close to the diagonal dotted line are consistent with a well-calibrated model: - -```{r plot, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 700} -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected"), - next_score_type = factor("No Score (0)")) -plot %>% - #about .75M plays in total - #filter(n_plays >= 50) %>% - ungroup() %>% - mutate(type = fct_relevel(type, - "Opp_Safety", "Opp_Field_Goal", - "Opp_Touchdown", "No_Score", "Safety", - "Field_Goal", "Touchdown" - ), - type = fct_recode(type, - "-Field Goal (-3)" = "Opp_Field_Goal", - "-Safety (-2)" = "Opp_Safety", - "-Touchdown (-7)" = "Opp_Touchdown", - "Field Goal (3)" = "Field_Goal", - "No Score (0)" = "No_Score", - "Touchdown (7)" = "Touchdown", - "Safety (2)" = "Safety")) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated next score probability", - y = "Observed next score probability") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 2) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = c(1, .05), legend.justification = c(1, 0)) + - facet_wrap(~ type, ncol = 4) -``` - -There is some weirdness with the opponent safety predictions, but these dots represent an *extremely* small number of plays (10-50 plays out of about 750,000). - -Now let's get the calibration error using the measure developed in Yurko et al., and compare it to nflscrapR. First we need to get the nflscrapR predictions, which we have saved from the previous version of nflfastR which applied the nflscrapR models. - -```{r cal} -#calibration error -cv_cal_error <- plot %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(type) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_scoring_event = sum(n_outcome, na.rm = TRUE)) - -pbp_data <- readRDS('cal_data_nflscrapr.rds') -#nflscrapr calibration error -nflscrapr <- pbp_data %>% - select(td_prob, opp_td_prob, fg_prob, opp_fg_prob, safety_prob, opp_safety_prob, no_score_prob, Next_Score_Half) %>% - pivot_longer(-Next_Score_Half, names_to = 'type', values_to = 'pred_prob') %>% - mutate(bin_pred_prob = round(pred_prob / 0.05) * .05) %>% - mutate(outcome = Next_Score_Half, - type = case_when( - type == "td_prob" ~ 'Touchdown', - type == 'fg_prob' ~ "Field_Goal", - type == "opp_td_prob" ~ "Opp_Touchdown", - type == 'opp_fg_prob' ~ "Opp_Field_Goal", - type == 'safety_prob' ~ "Safety", - type == 'opp_safety_prob' ~ "Opp_Safety", - type == "no_score_prob" ~ "No_Score" - )) %>% - group_by(type, bin_pred_prob) %>% - mutate(correct = if_else(outcome == type, 1, 0)) %>% - summarize(n_plays = n(), - n_outcome = sum(correct), - bin_actual_prob = n_outcome / n_plays) %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(type) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_scoring_event = sum(n_outcome, na.rm = TRUE)) -rm(pbp_data) - -message(glue::glue( -' ---CALIBRATION ERROR-- - -nflfastR: -{round(with(cv_cal_error, weighted.mean(weight_cal_error, n_scoring_event)), 4)} - -nflscrapR: -{round(with(nflscrapr, weighted.mean(weight_cal_error, n_scoring_event)), 4)} -' -)) -``` -We see that the new EP model is better calibrated. Note that nflscrapR reports a calibration error of 0.01309723. The number is higher here because of the additional seasons included outside of the time period nflscrapR was trained on, and the lack of era adjustment in nflscrapR. - -## WP Model Calibration Results - -As with EP, do some initial setup to get the data ready for fitting. -```{r wp_setup} -model_data <- readRDS('cal_data.rds') -model_data <- model_data %>% - make_model_mutations() %>% - prepare_wp_data() %>% - mutate(label = ifelse(posteam == Winner, 1, 0)) %>% - filter(!is.na(ep) & !is.na(score_differential) & !is.na(play_type) & !is.na(label)) %>% - select( - label, - receive_2h_ko, - spread_time, - half_seconds_remaining, - game_seconds_remaining, - ExpScoreDiff_Time_Ratio, - score_differential, - ep, - down, - ydstogo, - home, - posteam_timeouts_remaining, - defteam_timeouts_remaining, - season, - #only needed for the plots here, not used in model - qtr - ) %>% - filter(qtr <= 4) - -nrounds = 65 -params <- - list( - booster = "gbtree", - objective = "binary:logistic", - eval_metric = c("logloss"), - eta = 0.2, - gamma = 0, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 4, - min_child_weight = 1 - ) -``` - -Do the LOSO fitting: -``` {r wp_loso, results = 'hide'} -cv_results <- map_dfr(seasons, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-label, -qtr, -spread_time)), - label = train_data$label) - wp_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(wp_model, as.matrix(test_data %>% select(-label, -qtr, -spread_time)))) - ) %>% - dplyr::rename(wp = V1) - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#TIME FOR BINNING -wp_cv_loso_calibration_results <- cv_results %>% - # Create BINS for wp: - mutate(bin_pred_prob = round(wp / 0.05) * .05) %>% - # Group by both the qtr and bin_pred_prob: - group_by(qtr, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_wins = length(which(label == 1)), - bin_actual_prob = n_wins / n_plays) - -``` - -The WP plot. Looks good! -```{r plot_wp, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 700} -# Create a label data frame for the chart: -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected"), - qtr = factor("1st Quarter")) - -# Create the calibration chart: -wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(qtr = fct_recode(factor(qtr), "1st Quarter" = "1", "2nd Quarter" = "2", - "3rd Quarter" = "3", "4th Quarter" = "4")) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated win probability", - y = "Observed win probability") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 2) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = "bottom") + - facet_wrap(~ qtr, ncol = 4) -``` - -And get the WP calibration error: -```{r cal_error_wp} -# Calculate the calibration error values: -wp_cv_cal_error <- wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(qtr) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_wins = sum(n_wins, na.rm = TRUE)) - -#get nflscrapR to compare -pbp_data <- readRDS('cal_data_nflscrapr.rds') %>% - mutate(label = ifelse(posteam == Winner, 1, 0)) %>% - filter(qtr <= 4, !is.na(label), !is.na(posteam), !is.na(wp)) - -nflscrapR <- pbp_data %>% - # Create binned probability column: - mutate(bin_pred_prob = round(wp / 0.05) * .05) %>% - # Group by both the qtr and bin_pred_prob: - group_by(qtr, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_wins = length(which(label == 1)), - bin_actual_prob = n_wins / n_plays) %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(qtr) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_wins = sum(n_wins, na.rm = TRUE)) - -message(glue::glue( - '--CALIBRATION ERROR-- - -nflfastR: -{round(with(wp_cv_cal_error, weighted.mean(weight_cal_error, n_wins)), 4)} - -nflscrapR: -{round(with(nflscrapR, weighted.mean(weight_cal_error, n_wins)), 4)}' -)) -``` -Again, the new WP model represents an improvement. - - -## WP Model Calibration Results: with point spread - -`nflfastR` has a secondary win probability model that also incorporates the pregame spread to more accurately reflect a team's chances of winning. Below are calibration results for this model. - -```{r wp_setup_spread} -nrounds = 170 -params <- - list( - booster = "gbtree", - objective = "binary:logistic", - eval_metric = c("logloss"), - eta = 0.075, - gamma = 3, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 5, - min_child_weight = .9 - ) -``` - -Do the LOSO fitting: -``` {r wp_loso_spread, results = 'hide'} -cv_results <- map_dfr(seasons, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-label, -qtr)), - label = train_data$label) - wp_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(wp_model, as.matrix(test_data %>% select(-label, -qtr)))) - ) %>% - dplyr::rename(wp = V1) - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#TIME FOR BINNING -wp_cv_loso_calibration_results <- cv_results %>% - # Create BINS for wp: - mutate(bin_pred_prob = round(wp / 0.05) * .05) %>% - # Group by both the qtr and bin_pred_prob: - group_by(qtr, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_wins = length(which(label == 1)), - bin_actual_prob = n_wins / n_plays) - -``` - -The WP plot. -```{r plot_wp_spread, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 700} -# Create a label data frame for the chart: -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected"), - qtr = factor("1st Quarter")) - -# Create the calibration chart: -wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(qtr = fct_recode(factor(qtr), "1st Quarter" = "1", "2nd Quarter" = "2", - "3rd Quarter" = "3", "4th Quarter" = "4")) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated win probability", - y = "Observed win probability") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 2) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = "bottom") + - facet_wrap(~ qtr, ncol = 4) -``` - -And get the WP calibration error: -```{r cal_error_wp_spread} -# Calculate the calibration error values: -wp_cv_cal_error <- wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(qtr) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_wins = sum(n_wins, na.rm = TRUE)) -message(glue::glue( - '--CALIBRATION ERROR-- - -nflfastR with Vegas line: -{round(with(wp_cv_cal_error, weighted.mean(weight_cal_error, n_wins)), 4)} - -nflscrapR: -{round(with(nflscrapR, weighted.mean(weight_cal_error, n_wins)), 4)}' -)) -``` -Again, the new WP model is better calibrated than nflscrapR. In our testing, incorporating the spread substantially improved the performance of the model as measured by cross-validation classification accuracy (reduced error rate from 27% to 23%) and log loss (reduced from .52 to .45). We include a time-decaying function of spread on its own as including spread on its own increases the LOSO calibration error, especially in the fourth quarter. We also tried removing the `home` indicator in the spread model, but this worsened the calibration results. - -## CP Model Calibration Results - -By now, the process should be familiar. -``` {r cp-setup, results = 'hide'} -pbp <- readRDS('cal_data.rds') - -model_data <- pbp %>% - filter(season >= 2006) %>% - make_model_mutations() %>% - dplyr::mutate(receiver_player_name = - stringr::str_extract(desc, "(?<=((to)|(for))\\s[:digit:]{0,2}\\-{0,1})[A-Z][A-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?"), - pass_middle = dplyr::if_else(pass_location == 'middle', 1, 0), - air_is_zero= dplyr::if_else(air_yards == 0,1,0), - distance_to_sticks = air_yards - ydstogo - ) %>% - dplyr::filter(complete_pass == 1 | incomplete_pass == 1 | interception == 1) %>% - dplyr::filter(!is.na(air_yards) & air_yards >= -15 & air_yards <70 & !is.na(receiver_player_name) & !is.na(pass_location)) %>% - dplyr::select( - season, complete_pass, air_yards, yardline_100, ydstogo, - down1, down2, down3, down4, air_is_zero, pass_middle, - era2, era3, era4, qb_hit, home, model_week, - outdoors, retractable, dome, distance_to_sticks - ) -rm(pbp) - - -nrounds = 70 -params <- - list( - booster = "gbtree", - objective = "binary:logistic", - eval_metric = c("logloss"), - eta = 0.2, - gamma = 5, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 4, - min_child_weight = 6, - base_score = mean(model_data$complete_pass) - ) - -cv_results <- map_dfr(2006:2019, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-complete_pass)), - label = train_data$complete_pass) - cp_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(cp_model, as.matrix(test_data %>% select(-complete_pass)))) - ) %>% - dplyr::rename(cp = V1) - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#TIME FOR BINNING -cp_cv_loso_calibration_results <- cv_results %>% - # Create BINS for wp: - mutate( - bin_pred_prob = round(cp / 0.05) * .05, - distance = case_when( - air_yards < 5 ~ "Short", - air_yards >= 5 & air_yards < 15 ~ "Intermediate", - air_yards >= 15 ~ "Deep" - ) - ) %>% - # Group by both the qtr and bin_pred_prob: - group_by(distance, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_complete = length(which(complete_pass == 1)), - bin_actual_prob = n_complete / n_plays) - -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected") - ) -``` - -Plot the results: -```{r plot_cp, warning = FALSE, message = FALSE, results = 'hide', fig.keep = 'all', dpi = 700} -cp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(distance = fct_relevel(distance, - "Short", "Intermediate", "Deep") - ) %>% - filter(n_plays > 10) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated completion percentage", - y = "Observed completion percentage") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 3) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = "bottom") + - facet_wrap(~ distance, ncol = 3) -``` - -And get the calibration error: -```{r cp-error} -cp_cv_cal_error <- cp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(distance) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_complete = sum(n_complete, na.rm = TRUE)) - -round(with(cp_cv_cal_error, weighted.mean(weight_cal_error, n_complete)), 4) -``` diff --git a/data-raw/MODEL-README.md b/data-raw/MODEL-README.md deleted file mode 100644 index a7f8852a..00000000 --- a/data-raw/MODEL-README.md +++ /dev/null @@ -1,810 +0,0 @@ -nflfastR EP, WP, and CP models -================ -Ben Baldwin - - - [About](#about) - - [Model features](#model-features) - - [EP Model Calibration Results](#ep-model-calibration-results) - - [WP Model Calibration Results](#wp-model-calibration-results) - - [WP Model Calibration Results: with point - spread](#wp-model-calibration-results-with-point-spread) - - [CP Model Calibration Results](#cp-model-calibration-results) - -## About - -This page describes the nflfastR Expected Points (EP), Win Probability -(WP), and Completion Percentage (CP) models before showing that they are -well calibrated using the procedure [introduced by Yurko, Ventura, and -Horowitz](https://arxiv.org/pdf/1802.00998.pdf). Because the 2020 season -will mark 22 seasons of nflfastR data, the main purpose behind creating -new models for EP and WP was to build in era adjustments to fascilitate -better cross-era comparisons. However, we also discovered that switching -to tree-based methods could improve model calibration, especially for -end-of-half situations with complicated nonlinear interactions between -variables. Because we are introducing new models, we compare our -calibation results to nflscrapR to show that these new models are -somewhat better calibrated. If they weren’t, there would be no point in -updating the models\! - -nflfastR switching from the nflscrapR EP and WP models to its own model -should not be thought of as a criticism of nflscrapR: the improvements -are relatively minor and nflscrapR provided the code base to perform -much of this analysis, breaking new ground in the process. - -## Model features - -The EP, WP, and CP models are trained using xgboost, which uses training -data to create [decision -trees](https://xgboost.readthedocs.io/en/latest/tutorials/model.html). - -**EP model features** - - - Seconds remaining in half - - Yard line - - Whether possession team is at home - - Roof type: retractable, dome, or outdoors - - Down - - Yards to go - - Era: 1999-2001 (pre-expansion), 2002-2005 (pre-CPOE), 2006-2013 - (pre-LOB rules change), 2014-2017, 2018 and beyond - - Week (the model is trained on regular season only, so for playoff - games, we impute week = 17) - - Timeouts remaining for each team - -**WP model features** - - - Seconds remaining in half - - Seconds remaining in game - - Yard line - - Expected Points - - Score differential - - Ratio of expected score differential (expected points + point - differential) to time remaining (feature borrowed from nflscrapR) - - Down - - Yards to go - - Timeouts remaining for each team - - Whether team will receive 2nd half kickoff - - Whether possession team is at home - - \[Model with Vegas line only: point spread \* log(3600 / (50 + - (seconds elapsed in game))\] - -**CP model features** - - - Yard line - - Whether possession team is at home - - Roof type: retractable, dome, or outdoors - - Down - - Yards to go - - Distance to sticks (air yards - yards to go) - - Era: 2006-2013, 2014-2017, 2018 and beyond (note that air yards data - only go back to 2006, so there is no CP for earlier years) - - Week (the model is trained on regular season only, so for playoff - games, we impute week = 17) - - Air yards - - Whether air yards is 0 (probably unnecessary with tree-based method - and a relic from earlier models where it was included because - completion percentage is much lower for 0 air yard passes) - - Pass location (binary: middle or not middle) - - Whether quarterback was hit on the play - -## EP Model Calibration Results - -The goal of this section is to show that the nflfastR EP model is well -calibrated. To measure calibration, we follow Yurko et al. and perform -leave-one-season-out (LOSO) calibration. In particular, for each of the -20 available seasons (2000-2019), we exclude one season, train the EP -model on the other 19 seasons, and then compare the model’s predictions -in the holdout season to what actually happened in that season. If the -model is well calibrated, we would expect that, for example, 50 percent -of plays with a touchdown probability of 50 percent prior to the play -would have the next score be a touchdown for the possession team. - -Let’s start with some setup. The file used here isn’t pushed because -it’s large, but its creation can [be seen -here](https://github.com/mrcaseb/fastscraper/blob/ben-upstream/data-raw/MODELS.R). - -``` r -set.seed(2013) #GoHawks -library(tidyverse) -library(xgboost) - -#some helper files are in these -source('../R/helper_add_nflscrapr_mutations.R') -source('../R/helper_add_ep_wp.R') -source('../R/helper_add_cp_cpoe.R') - -pbp_data <- readRDS('cal_data.rds') -model_data <- pbp_data %>% - #in '../R/helper_add_nflscrapr_mutations.R' - make_model_mutations() %>% - mutate( - label = case_when( - Next_Score_Half == "Touchdown" ~ 0, - Next_Score_Half == "Opp_Touchdown" ~ 1, - Next_Score_Half == "Field_Goal" ~ 2, - Next_Score_Half == "Opp_Field_Goal" ~ 3, - Next_Score_Half == "Safety" ~ 4, - Next_Score_Half == "Opp_Safety" ~ 5, - Next_Score_Half == "No_Score" ~ 6 - ), - label = as.factor(label), - #use nflscrapR weights - Drive_Score_Dist = Drive_Score_Half - drive, - Drive_Score_Dist_W = (max(Drive_Score_Dist) - Drive_Score_Dist) / - (max(Drive_Score_Dist) - min(Drive_Score_Dist)), - ScoreDiff_W = (max(abs(score_differential), na.rm=T) - abs(score_differential)) / - (max(abs(score_differential), na.rm=T) - min(abs(score_differential), na.rm=T)), - Total_W = Drive_Score_Dist_W + ScoreDiff_W, - Total_W_Scaled = (Total_W - min(Total_W, na.rm=T)) / - (max(Total_W, na.rm=T) - min(Total_W, na.rm=T)) - ) %>% - filter( - !is.na(defteam_timeouts_remaining), !is.na(posteam_timeouts_remaining), - !is.na(yardline_100) - ) %>% - select( - label, - season, - half_seconds_remaining, - yardline_100, - home, - retractable, - dome, - outdoors, - ydstogo, - era0, era1, era2, era3, era4, - down1, down2, down3, down4, - posteam_timeouts_remaining, - defteam_timeouts_remaining, - model_week, - Total_W_Scaled - ) - -#idk why this is all necessary for xgb but it is -model_data <- model_data %>% - mutate(label = as.numeric(label), - label = label - 1) - -rm(pbp_data) - -seasons <- unique(model_data$season) -``` - -Input the stuff we’ll need to fit the model. The parameters were -obtained from cross-validation, where each season was forced to be -entirely contained in a given CV fold to prevent leakage in labels from -one fold to another (for example, if a given drive were split up between -folds). - -``` r -nrounds = 70 -params <- - list( - booster = "gbtree", - objective = "multi:softprob", - eval_metric = c("mlogloss"), - num_class = 7, - eta = 0.2, - gamma = .2, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 4, - min_child_weight = .9 - ) -``` - -Now do the LOSO model fitting. - -``` r -cv_results <- map_dfr(seasons, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-label, -Total_W_Scaled)), - label = train_data$label, weight = train_data$Total_W_Scaled) - ep_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(ep_model, as.matrix(test_data %>% select(-label, -Total_W_Scaled))), ncol=7, byrow=TRUE) - ) - colnames(preds) <- c("Touchdown","Opp_Touchdown","Field_Goal","Opp_Field_Goal", - "Safety","Opp_Safety","No_Score") - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#get the BINS for the calibration plot -plot <- cv_results %>% - select(Touchdown, Opp_Touchdown, Field_Goal, Opp_Field_Goal, Safety, Opp_Safety, No_Score, label) %>% - pivot_longer(-label, names_to = 'type', values_to = 'pred_prob') %>% - mutate(bin_pred_prob = round(pred_prob / 0.05) * .05) %>% - mutate(outcome = case_when( - label == 0 ~ "Touchdown", - label == 1 ~ "Opp_Touchdown", - label == 2 ~ "Field_Goal", - label == 3 ~ "Opp_Field_Goal", - label == 4 ~ "Safety", - label == 5 ~ "Opp_Safety", - label == 6 ~ "No_Score" - )) %>% - group_by(type, bin_pred_prob) %>% - mutate(correct = if_else(outcome == type, 1, 0)) %>% - summarize(n_plays = n(), - n_outcome = sum(correct), - bin_actual_prob = n_outcome / n_plays) -``` - -Here is the EP calibration plot. Points close to the diagonal dotted -line are consistent with a well-calibrated model: - -``` r -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected"), - next_score_type = factor("No Score (0)")) -plot %>% - #about .75M plays in total - #filter(n_plays >= 50) %>% - ungroup() %>% - mutate(type = fct_relevel(type, - "Opp_Safety", "Opp_Field_Goal", - "Opp_Touchdown", "No_Score", "Safety", - "Field_Goal", "Touchdown" - ), - type = fct_recode(type, - "-Field Goal (-3)" = "Opp_Field_Goal", - "-Safety (-2)" = "Opp_Safety", - "-Touchdown (-7)" = "Opp_Touchdown", - "Field Goal (3)" = "Field_Goal", - "No Score (0)" = "No_Score", - "Touchdown (7)" = "Touchdown", - "Safety (2)" = "Safety")) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated next score probability", - y = "Observed next score probability") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 2) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = c(1, .05), legend.justification = c(1, 0)) + - facet_wrap(~ type, ncol = 4) -``` - - - -There is some weirdness with the opponent safety predictions, but these -dots represent an *extremely* small number of plays (10-50 plays out of -about 750,000). - -Now let’s get the calibration error using the measure developed in Yurko -et al., and compare it to nflscrapR. First we need to get the nflscrapR -predictions, which we have saved from the previous version of nflfastR -which applied the nflscrapR models. - -``` r -#calibration error -cv_cal_error <- plot %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(type) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_scoring_event = sum(n_outcome, na.rm = TRUE)) - -pbp_data <- readRDS('cal_data_nflscrapr.rds') -#nflscrapr calibration error -nflscrapr <- pbp_data %>% - select(td_prob, opp_td_prob, fg_prob, opp_fg_prob, safety_prob, opp_safety_prob, no_score_prob, Next_Score_Half) %>% - pivot_longer(-Next_Score_Half, names_to = 'type', values_to = 'pred_prob') %>% - mutate(bin_pred_prob = round(pred_prob / 0.05) * .05) %>% - mutate(outcome = Next_Score_Half, - type = case_when( - type == "td_prob" ~ 'Touchdown', - type == 'fg_prob' ~ "Field_Goal", - type == "opp_td_prob" ~ "Opp_Touchdown", - type == 'opp_fg_prob' ~ "Opp_Field_Goal", - type == 'safety_prob' ~ "Safety", - type == 'opp_safety_prob' ~ "Opp_Safety", - type == "no_score_prob" ~ "No_Score" - )) %>% - group_by(type, bin_pred_prob) %>% - mutate(correct = if_else(outcome == type, 1, 0)) %>% - summarize(n_plays = n(), - n_outcome = sum(correct), - bin_actual_prob = n_outcome / n_plays) %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(type) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_scoring_event = sum(n_outcome, na.rm = TRUE)) -rm(pbp_data) - -message(glue::glue( -' ---CALIBRATION ERROR-- - -nflfastR: -{round(with(cv_cal_error, weighted.mean(weight_cal_error, n_scoring_event)), 4)} - -nflscrapR: -{round(with(nflscrapr, weighted.mean(weight_cal_error, n_scoring_event)), 4)} -' -)) -#> --CALIBRATION ERROR-- -#> -#> nflfastR: -#> 0.0063 -#> -#> nflscrapR: -#> 0.017 -``` - -We see that the new EP model is better calibrated. Note that nflscrapR -reports a calibration error of 0.01309723. The number is higher here -because of the additional seasons included outside of the time period -nflscrapR was trained on, and the lack of era adjustment in nflscrapR. - -## WP Model Calibration Results - -As with EP, do some initial setup to get the data ready for fitting. - -``` r -model_data <- readRDS('cal_data.rds') -model_data <- model_data %>% - make_model_mutations() %>% - prepare_wp_data() %>% - mutate(label = ifelse(posteam == Winner, 1, 0)) %>% - filter(!is.na(ep) & !is.na(score_differential) & !is.na(play_type) & !is.na(label)) %>% - select( - label, - receive_2h_ko, - spread_time, - half_seconds_remaining, - game_seconds_remaining, - ExpScoreDiff_Time_Ratio, - score_differential, - ep, - down, - ydstogo, - home, - posteam_timeouts_remaining, - defteam_timeouts_remaining, - season, - #only needed for the plots here, not used in model - qtr - ) %>% - filter(qtr <= 4) - -nrounds = 65 -params <- - list( - booster = "gbtree", - objective = "binary:logistic", - eval_metric = c("logloss"), - eta = 0.2, - gamma = 0, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 4, - min_child_weight = 1 - ) -``` - -Do the LOSO fitting: - -``` r -cv_results <- map_dfr(seasons, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-label, -qtr, -spread_time)), - label = train_data$label) - wp_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(wp_model, as.matrix(test_data %>% select(-label, -qtr, -spread_time)))) - ) %>% - dplyr::rename(wp = V1) - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#TIME FOR BINNING -wp_cv_loso_calibration_results <- cv_results %>% - # Create BINS for wp: - mutate(bin_pred_prob = round(wp / 0.05) * .05) %>% - # Group by both the qtr and bin_pred_prob: - group_by(qtr, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_wins = length(which(label == 1)), - bin_actual_prob = n_wins / n_plays) -``` - -The WP plot. Looks good\! - -``` r -# Create a label data frame for the chart: -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected"), - qtr = factor("1st Quarter")) - -# Create the calibration chart: -wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(qtr = fct_recode(factor(qtr), "1st Quarter" = "1", "2nd Quarter" = "2", - "3rd Quarter" = "3", "4th Quarter" = "4")) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated win probability", - y = "Observed win probability") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 2) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = "bottom") + - facet_wrap(~ qtr, ncol = 4) -``` - - - -And get the WP calibration error: - -``` r -# Calculate the calibration error values: -wp_cv_cal_error <- wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(qtr) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_wins = sum(n_wins, na.rm = TRUE)) - -#get nflscrapR to compare -pbp_data <- readRDS('cal_data_nflscrapr.rds') %>% - mutate(label = ifelse(posteam == Winner, 1, 0)) %>% - filter(qtr <= 4, !is.na(label), !is.na(posteam), !is.na(wp)) - -nflscrapR <- pbp_data %>% - # Create binned probability column: - mutate(bin_pred_prob = round(wp / 0.05) * .05) %>% - # Group by both the qtr and bin_pred_prob: - group_by(qtr, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_wins = length(which(label == 1)), - bin_actual_prob = n_wins / n_plays) %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(qtr) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_wins = sum(n_wins, na.rm = TRUE)) - -message(glue::glue( - '--CALIBRATION ERROR-- - -nflfastR: -{round(with(wp_cv_cal_error, weighted.mean(weight_cal_error, n_wins)), 4)} - -nflscrapR: -{round(with(nflscrapR, weighted.mean(weight_cal_error, n_wins)), 4)}' -)) -#> --CALIBRATION ERROR-- -#> -#> nflfastR: -#> 0.0059 -#> -#> nflscrapR: -#> 0.0397 -``` - -Again, the new WP model represents an improvement. - -## WP Model Calibration Results: with point spread - -`nflfastR` has a secondary win probability model that also incorporates -the pregame spread to more accurately reflect a team’s chances of -winning. Below are calibration results for this model. - -``` r -nrounds = 170 -params <- - list( - booster = "gbtree", - objective = "binary:logistic", - eval_metric = c("logloss"), - eta = 0.075, - gamma = 3, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 5, - min_child_weight = .9 - ) -``` - -Do the LOSO fitting: - -``` r -cv_results <- map_dfr(seasons, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-label, -qtr)), - label = train_data$label) - wp_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(wp_model, as.matrix(test_data %>% select(-label, -qtr)))) - ) %>% - dplyr::rename(wp = V1) - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#TIME FOR BINNING -wp_cv_loso_calibration_results <- cv_results %>% - # Create BINS for wp: - mutate(bin_pred_prob = round(wp / 0.05) * .05) %>% - # Group by both the qtr and bin_pred_prob: - group_by(qtr, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_wins = length(which(label == 1)), - bin_actual_prob = n_wins / n_plays) -``` - -The WP plot. - -``` r -# Create a label data frame for the chart: -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected"), - qtr = factor("1st Quarter")) - -# Create the calibration chart: -wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(qtr = fct_recode(factor(qtr), "1st Quarter" = "1", "2nd Quarter" = "2", - "3rd Quarter" = "3", "4th Quarter" = "4")) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated win probability", - y = "Observed win probability") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 2) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = "bottom") + - facet_wrap(~ qtr, ncol = 4) -``` - - - -And get the WP calibration error: - -``` r -# Calculate the calibration error values: -wp_cv_cal_error <- wp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(qtr) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_wins = sum(n_wins, na.rm = TRUE)) -message(glue::glue( - '--CALIBRATION ERROR-- - -nflfastR with Vegas line: -{round(with(wp_cv_cal_error, weighted.mean(weight_cal_error, n_wins)), 4)} - -nflscrapR: -{round(with(nflscrapR, weighted.mean(weight_cal_error, n_wins)), 4)}' -)) -#> --CALIBRATION ERROR-- -#> -#> nflfastR with Vegas line: -#> 0.0059 -#> -#> nflscrapR: -#> 0.0397 -``` - -Again, the new WP model is better calibrated than nflscrapR. In our testing, -incorporating the spread substantially improved the performance of the -model as measured by cross-validation classification accuracy (reduced -error rate from 27% to 23%) and log loss (reduced from .52 to .45). We -include a time-decaying function of spread on its own as including -spread on its own increases the LOSO calibration error, especially in -the fourth quarter. We also tried removing the `home` indicator in the -spread model, but this worsened the calibration results. - -## CP Model Calibration Results - -By now, the process should be familiar. - -``` r -pbp <- readRDS('cal_data.rds') - -model_data <- pbp %>% - filter(season >= 2006) %>% - make_model_mutations() %>% - dplyr::mutate(receiver_player_name = - stringr::str_extract(desc, "(?<=((to)|(for))\\s[:digit:]{0,2}\\-{0,1})[A-Z][A-z]*\\.\\s?[A-Z][A-z]+(\\s(I{2,3})|(IV))?"), - pass_middle = dplyr::if_else(pass_location == 'middle', 1, 0), - air_is_zero= dplyr::if_else(air_yards == 0,1,0), - distance_to_sticks = air_yards - ydstogo - ) %>% - dplyr::filter(complete_pass == 1 | incomplete_pass == 1 | interception == 1) %>% - dplyr::filter(!is.na(air_yards) & air_yards >= -15 & air_yards <70 & !is.na(receiver_player_name) & !is.na(pass_location)) %>% - dplyr::select( - season, complete_pass, air_yards, yardline_100, ydstogo, - down1, down2, down3, down4, air_is_zero, pass_middle, - era2, era3, era4, qb_hit, home, model_week, - outdoors, retractable, dome, distance_to_sticks - ) -rm(pbp) - - -nrounds = 70 -params <- - list( - booster = "gbtree", - objective = "binary:logistic", - eval_metric = c("logloss"), - eta = 0.2, - gamma = 5, - subsample=0.8, - colsample_bytree=0.8, - max_depth = 4, - min_child_weight = 6, - base_score = mean(model_data$complete_pass) - ) - -cv_results <- map_dfr(2006:2019, function(x) { - - test_data <- model_data %>% - filter(season == x) %>% - select(-season) - train_data <- model_data %>% - filter(season != x) %>% - select(-season) - - full_train = xgboost::xgb.DMatrix(model.matrix(~.+0, data = train_data %>% select(-complete_pass)), - label = train_data$complete_pass) - cp_model <- xgboost::xgboost(params = params, data = full_train, nrounds = nrounds, verbose = 2) - - preds <- as.data.frame( - matrix(predict(cp_model, as.matrix(test_data %>% select(-complete_pass)))) - ) %>% - dplyr::rename(cp = V1) - - cv_data <- bind_cols(test_data, preds) %>% mutate(season = x) - return(cv_data) - -}) - -#TIME FOR BINNING -cp_cv_loso_calibration_results <- cv_results %>% - # Create BINS for wp: - mutate( - bin_pred_prob = round(cp / 0.05) * .05, - distance = case_when( - air_yards < 5 ~ "Short", - air_yards >= 5 & air_yards < 15 ~ "Intermediate", - air_yards >= 15 ~ "Deep" - ) - ) %>% - # Group by both the qtr and bin_pred_prob: - group_by(distance, bin_pred_prob) %>% - # Calculate the calibration results: - summarize(n_plays = n(), - n_complete = length(which(complete_pass == 1)), - bin_actual_prob = n_complete / n_plays) - -ann_text <- data.frame(x = c(.25, 0.75), y = c(0.75, 0.25), - lab = c("More times\nthan expected", "Fewer times\nthan expected") - ) -``` - -Plot the results: - -``` r -cp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(distance = fct_relevel(distance, - "Short", "Intermediate", "Deep") - ) %>% - filter(n_plays > 10) %>% - ggplot() + - geom_point(aes(x = bin_pred_prob, y = bin_actual_prob, size = n_plays)) + - geom_smooth(aes(x = bin_pred_prob, y = bin_actual_prob), method = "loess") + - geom_abline(slope = 1, intercept = 0, color = "black", lty = 2) + - coord_equal() + - scale_x_continuous(limits = c(0,1)) + - scale_y_continuous(limits = c(0,1)) + - labs(size = "Number of plays", - x = "Estimated completion percentage", - y = "Observed completion percentage") + - geom_text(data = ann_text, aes(x = x, y = y, label = lab), size = 3) + - theme_bw() + - theme(plot.title = element_text(hjust = 0.5), - strip.background = element_blank(), - strip.text = element_text(size = 12), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 12), - axis.text.x = element_text(size = 10, angle = 90), - legend.title = element_text(size = 12), - legend.text = element_text(size = 12), - legend.position = "bottom") + - facet_wrap(~ distance, ncol = 3) -``` - - - -And get the calibration error: - -``` r -cp_cv_cal_error <- cp_cv_loso_calibration_results %>% - ungroup() %>% - mutate(cal_diff = abs(bin_pred_prob - bin_actual_prob)) %>% - group_by(distance) %>% - summarize(weight_cal_error = weighted.mean(cal_diff, n_plays, na.rm = TRUE), - n_complete = sum(n_complete, na.rm = TRUE)) - -round(with(cp_cv_cal_error, weighted.mean(weight_cal_error, n_complete)), 4) -#> [1] 0.0059 -``` diff --git a/data-raw/MODELS_DATA.R b/data-raw/MODELS_DATA.R deleted file mode 100644 index 710b97c2..00000000 --- a/data-raw/MODELS_DATA.R +++ /dev/null @@ -1,154 +0,0 @@ -################################################################################ -# Author: Ben Baldwin -# Purpose: Prepare data for nflfastR models for EP, CP, Field Goals, and WP -# This takes a long time (especially finding next score half) -# Save a pre-prepared df -################################################################################ - -library(tidyverse) -source('data-raw/EP_functions.R') - -################################################################################ -# DATA PREP -################################################################################ - -#read in data from data repo -pbp_data <- purrr::map_df(1999 : 2019, function(x) { - readRDS( - url( - glue::glue("https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{x}.rds") - ) - ) %>% filter(season_type == 'REG') -}) %>% - mutate( - Winner = if_else(home_score > away_score, home_team, - if_else(home_score < away_score, away_team, "TIE")) - ) - -#get next score half using the provided function -pbp_next_score_half <- map_dfr(unique(pbp_data$game_id), - function(x) { - pbp_data %>% - filter(game_id == x) %>% - find_game_next_score_half() - }) - -#bind to original df -pbp_data <- bind_cols(pbp_data, pbp_next_score_half) - -#for estimating the models, apply some filters -pbp_data <- pbp_data %>% - filter(Next_Score_Half %in% c("Opp_Field_Goal", "Opp_Safety", "Opp_Touchdown", - "Field_Goal", "No_Score", "Safety", "Touchdown") & - play_type %in% c("field_goal", "no_play", "pass", "punt", "run", - "qb_spike") & is.na(two_point_conv_result) & is.na(extra_point_result) & - !is.na(down) & !is.na(game_seconds_remaining)) %>% - #to keep file size manageable - select( - game_id, - Next_Score_Half, - Drive_Score_Half, - play_type, - game_seconds_remaining, - half_seconds_remaining, - yardline_100, - roof, - posteam, - defteam, - home_team, - ydstogo, - season, - qtr, - down, - week, - drive, - ep, - score_differential, - posteam_timeouts_remaining, - defteam_timeouts_remaining, - desc, - receiver_player_name, - pass_location, - air_yards, - complete_pass, incomplete_pass, interception, - qb_hit, - extra_point_result, - field_goal_result, - sp, - Winner, - spread_line - ) - -#for doing calibation etc -saveRDS(pbp_data, 'data-raw/cal_data.rds') - -#fix roof types -#now fixing spread_line -#delete this after re-scraping and updating -games_data <- readRDS(url("https://github.com/leesharpe/nfldata/blob/master/data/games.rds?raw=true")) %>% - select(home_team, season, week, spread_line) %>% - mutate( - home_team = case_when( - home_team == 'STL' ~ 'LA', - home_team == 'SD' ~ 'LAC', - home_team == 'OAK' ~ 'LV', - TRUE ~ home_team - ) - ) - -pbp_data <- readRDS('data-raw/cal_data.rds') -pbp_data <- pbp_data %>% - left_join(games_data, by = c('home_team', 'season', 'week')) -saveRDS(pbp_data, 'data-raw/cal_data.rds') - - -################################################################################ -# DATA PREP FOR NFLSCRAPR COMPARISON -# This is only used in the readme describing the models -# Where we compare nflscrapR and nflfastR calibration errors -################################################################################ - -pbp_data <- purrr::map_df(2000 : 2019, function(x) { - readRDS( - url( - glue::glue("https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/legacy-data/play_by_play_{x}.rds") - ) - ) %>% filter(season_type == 'REG') -}) - -games <- readRDS(url("http://www.habitatring.com/games.rds")) %>% - filter(!is.na(result)) %>% - mutate( - game_id = as.numeric(old_game_id), - Winner = if_else(home_score > away_score, home_team, - if_else(home_score < away_score, away_team, "TIE")) - ) %>% - select(game_id, Winner, result, roof) - -pbp_data <- pbp_data %>% - left_join( - games, by = c('game_id') - ) - -#get next score half using the provided function -pbp_next_score_half <- map_dfr(unique(pbp_data$game_id), - function(x) { - pbp_data %>% - filter(game_id == x) %>% - find_game_next_score_half() - }) - -#bind to original df -pbp_data <- bind_cols(pbp_data, pbp_next_score_half) - -#apply filters -pbp_data <- pbp_data %>% - filter(Next_Score_Half %in% c("Opp_Field_Goal", "Opp_Safety", "Opp_Touchdown", - "Field_Goal", "No_Score", "Safety", "Touchdown") & - play_type %in% c("field_goal", "no_play", "pass", "punt", "run", - "qb_spike") & is.na(two_point_conv_result) & is.na(extra_point_result) & - !is.na(down) & !is.na(game_seconds_remaining)) %>% - select(posteam, wp, qtr, Winner, td_prob, opp_td_prob, fg_prob, opp_fg_prob, safety_prob, opp_safety_prob, no_score_prob, Next_Score_Half) - -#for doing calibation etc -saveRDS(pbp_data, 'data-raw/cal_data_nflscrapr.rds') diff --git a/data-raw/man/figures/README-plot-1.png b/data-raw/man/figures/README-plot-1.png deleted file mode 100644 index efcee8bc..00000000 Binary files a/data-raw/man/figures/README-plot-1.png and /dev/null differ diff --git a/data-raw/man/figures/README-plot_cp-1.png b/data-raw/man/figures/README-plot_cp-1.png deleted file mode 100644 index 1ccba2d4..00000000 Binary files a/data-raw/man/figures/README-plot_cp-1.png and /dev/null differ diff --git a/data-raw/man/figures/README-plot_wp-1.png b/data-raw/man/figures/README-plot_wp-1.png deleted file mode 100644 index 943ebe74..00000000 Binary files a/data-raw/man/figures/README-plot_wp-1.png and /dev/null differ diff --git a/data-raw/man/figures/README-plot_wp_spread-1.png b/data-raw/man/figures/README-plot_wp_spread-1.png deleted file mode 100644 index ebdf9402..00000000 Binary files a/data-raw/man/figures/README-plot_wp_spread-1.png and /dev/null differ diff --git a/data-raw/package_building_workflow.R b/data-raw/package_building_workflow.R deleted file mode 100644 index c80e3885..00000000 --- a/data-raw/package_building_workflow.R +++ /dev/null @@ -1,32 +0,0 @@ -# Package workflow -# library(devtools) - -# add all package dependencies to DESCRIPTION -# alphabetical order -usethis::use_package("dplyr", type = "Imports", min_version = NULL) -usethis::use_package("furrr", type = "Suggests", min_version = NULL) -usethis::use_package("future", type = "Suggests", min_version = NULL) -usethis::use_package("glue", type = "Imports", min_version = NULL) -usethis::use_package("httr", type = "Imports", min_version = NULL) -usethis::use_package("janitor", type = "Imports", min_version = NULL) -usethis::use_package("jsonlite", type = "Imports", min_version = NULL) -usethis::use_package("lubridate", type = "Imports", min_version = NULL) -usethis::use_package("magrittr", type = "Imports", min_version = NULL) -usethis::use_package("mgcv", type = "Imports", min_version = NULL) -usethis::use_package("purrr", type = "Imports", min_version = NULL) -usethis::use_package("progressr", type = "Imports", min_version = NULL) -usethis::use_package("stringr", type = "Imports", min_version = NULL) -usethis::use_package("tibble", type = "Imports", min_version = NULL) -usethis::use_package("tidyr", type = "Imports", min_version = NULL) -usethis::use_package("tidyselect", type = "Imports", min_version = NULL) -usethis::use_tidy_description() - -# add license -usethis::use_mit_license("Sebastian Carl; Ben Baldwin") - -# change version number to dev version -usethis::use_dev_version() - -# change version number to release version -# run this after every update and choose interactively the level of increment -usethis::use_version() diff --git a/man/figures/README-ex7-1.png b/man/figures/README-ex7-1.png deleted file mode 100644 index 5caa9d2e..00000000 Binary files a/man/figures/README-ex7-1.png and /dev/null differ