Kalen DeBoer Playcalling Analysis (2020-2023)¶
This notebook demonstrates the use of various machine learning techniques to analyze Kalen DeBoer's college football play calls. Kalen DeBoer was the head coach of Fresno State from 2020 to 2021 and the head coach of Washington from 2022 to 2023.
# Load the Coach object from a pickle file
import os
import pygskin
path_to_coach = os.path.join("data", "coaches", "Kalen_DeBoer.coach")
coach = pygskin.Coach.unpickle(path_to_coach)
print(f"Loaded Kalen DeBoer's file, containing data from {coach.first_year} to {coach.last_year}.")
Loaded Kalen DeBoer's file, containing data from 2020 to 2023.
Preprocess Data¶
import pandas as pd
# begin DataFrame operations to clean up play data
play_df = [game.play_df for game in coach.games_list]
# make a single DataFrame from all play DataFrames
play_df = pd.concat(play_df, axis=0)  # combine all per-game DataFrames into one
print(f"{len(play_df)} plays in all categories.")
4075 plays in all categories.
Engineer New Columns¶
# Make a single column for the remaining time
play_df["seconds_remaining"] = play_df["clock"].apply(lambda x: x["minutes"] * 60 + x["seconds"])
# Make a single column for the score difference
play_df["score_diff"] = play_df["offense_score"] - play_df["defense_score"]
# Yards gained per pass attempt
# Calculated by dividing yards_gained on passing plays by the number of pass attempts. Rolling average that resets at the start of each drive.
passing_yards = 0
passing_attempts = 0
prev_drive_id = -1
# reset index
play_df = play_df.reset_index(drop=True)
for index, row in play_df.iterrows():
    if row["drive_id"] != prev_drive_id:
        passing_yards = 0
        passing_attempts = 0
        prev_drive_id = row["drive_id"]
    if pygskin.get_play_type(row["play_type"]) == 0:
        passing_yards += row["yards_gained"]
        passing_attempts += 1
    try:
        play_df.at[index, "passing_yards_per_attempt"] = passing_yards / passing_attempts
    except ZeroDivisionError:
        play_df.at[index, "passing_yards_per_attempt"] = 0
# Yards gained per rush attempt
# Calculated by dividing yards_gained on rushing plays by the number of rush attempts. Rolling average that resets at the start of each drive.
rushing_yards = 0
rushing_attempts = 0
prev_drive_id = -1
# reset index
play_df = play_df.reset_index(drop=True)
for index, row in play_df.iterrows():
    if row["drive_id"] != prev_drive_id:
        rushing_yards = 0
        rushing_attempts = 0
        prev_drive_id = row["drive_id"]
    if pygskin.get_play_type(row["play_type"]) == 1:
        rushing_yards += row["yards_gained"]
        rushing_attempts += 1
    try:
        play_df.at[index, "rushing_yards_per_attempt"] = rushing_yards / rushing_attempts
    except ZeroDivisionError:
        play_df.at[index, "rushing_yards_per_attempt"] = 0
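The two loops above can also be expressed with vectorized pandas operations, which is much faster on large frames. A minimal sketch on hypothetical data; it stands in for `pygskin.get_play_type` with simple string matching (an assumption, not the library's actual encoding), and `fillna(0)` plays the role of the `ZeroDivisionError` handler:

```python
import pandas as pd

# Hypothetical toy frame with the same columns the loops above use
df = pd.DataFrame({
    "drive_id": [1, 1, 1, 2, 2],
    "play_type": ["Pass Reception", "Rush", "Pass Incompletion", "Rush", "Rush"],
    "yards_gained": [12, 3, 0, 5, 7],
})

def per_drive_average(frame, mask):
    """Cumulative yards-per-attempt for one play type, resetting each drive."""
    yards = frame["yards_gained"].where(mask, 0).groupby(frame["drive_id"]).cumsum()
    attempts = mask.astype(int).groupby(frame["drive_id"]).cumsum()
    # 0 attempts yields NaN (0/0); fill with 0 like the ZeroDivisionError branch
    return (yards / attempts).fillna(0)

# String matching is a stand-in for pygskin.get_play_type == 0 / == 1
df["passing_yards_per_attempt"] = per_drive_average(df, df["play_type"].str.contains("Pass"))
df["rushing_yards_per_attempt"] = per_drive_average(df, df["play_type"].str.contains("Rush"))
print(df)
```

Because `groupby(...).cumsum()` restarts within each `drive_id` group, no explicit `prev_drive_id` bookkeeping is needed.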
# Encode play_type as an integer
play_df["play_call"] = play_df["play_type"].apply(lambda x: pygskin.get_play_type(x))
play_df = play_df.dropna(subset=["play_call"])
play_df["play_call"] = play_df["play_call"].astype(int)
# drop rows with play_call == 5
play_df = play_df[play_df["play_call"] != 5]
print(f"{len(play_df)} plays to analyze before dropping rows with NaN values.")
print(play_df.columns)
3230 plays to analyze before dropping rows with NaN values.
Index(['id', 'drive_id', 'game_id', 'drive_number', 'play_number', 'offense', 'offense_conference', 'offense_score', 'defense', 'home', 'away', 'defense_conference', 'defense_score', 'period', 'clock', 'offense_timeouts', 'defense_timeouts', 'yard_line', 'yards_to_goal', 'down', 'distance', 'yards_gained', 'scoring', 'play_type', 'play_text', 'ppa', 'wallclock', 'week', 'season', 'seconds_remaining', 'score_diff', 'passing_yards_per_attempt', 'rushing_yards_per_attempt', 'play_call'], dtype='object')
Add Team Stats to Data¶
import cfbd
from dotenv import load_dotenv
configuration = cfbd.Configuration()
load_dotenv()
configuration.api_key["Authorization"] = os.getenv("API_KEY")
configuration.api_key_prefix["Authorization"] = "Bearer"
stats_api = cfbd.StatsApi(cfbd.ApiClient(configuration))
"""
Use these stats:
fieldGoalPct
fourthDownEff
thirdDownEff
"""
# Get stats for each game
all_game_stats = []
for game in coach.games_list:
    game_id = game.game_dict["id"]
    game_stats = stats_api.get_team_season_stats(
        year=game.game_dict["season"],
        team=coach.coach_school_dict[game.game_dict["season"]],
        start_week=1,
        end_week=max(1, game.game_dict["week"] - 1),
    )
    # default to 100 when a stat is missing (e.g., no prior games in week 1);
    # defaults must be set before the stat loop so one stat does not reset the others
    field_goal_pct = 100
    fourth_down_eff = 100
    third_down_eff = 100
    for stat in game_stats:
        stat = stat.to_dict()
        if stat["stat_name"] == "fieldGoalPct":
            field_goal_pct = stat["stat_value"]
        elif stat["stat_name"] == "fourthDownEff":
            fourth_down_eff = stat["stat_value"]
        elif stat["stat_name"] == "thirdDownEff":
            third_down_eff = stat["stat_value"]
    all_game_stats.append({
        "game_id": game_id,
        "field_goal_pct": field_goal_pct,
        "fourth_down_eff": fourth_down_eff,
        "third_down_eff": third_down_eff
    })
game_df = pd.DataFrame(all_game_stats, columns=["game_id", "field_goal_pct", "fourth_down_eff", "third_down_eff"])
# add stats to play_df
play_df = play_df.merge(game_df, on="game_id")
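Note that `merge` defaults to an inner join, so any play whose `game_id` has no stats row is silently dropped. A minimal sketch on hypothetical data showing how `how="left"`, `validate`, and `indicator` make those drops visible instead:

```python
import pandas as pd

# Hypothetical stand-ins for play_df and game_df
plays = pd.DataFrame({"game_id": [1, 1, 2, 3], "down": [1, 2, 1, 3]})
stats = pd.DataFrame({"game_id": [1, 2], "field_goal_pct": [80.0, 75.0]})

# A left merge keeps every play; `indicator` flags plays with no stats row,
# and `validate` asserts game_id is unique on the stats side
merged = plays.merge(stats, on="game_id", how="left",
                     validate="many_to_one", indicator=True)
missing = merged[merged["_merge"] == "left_only"]
print(f"{len(missing)} plays have no season-stats row.")
```

If the count is nonzero, those rows can be inspected before deciding whether to fill or drop them.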
Drop Columns and Split Data into Test/Train¶
from sklearn.model_selection import train_test_split
# drop columns with meta values
play_df = play_df.drop(["id", "drive_id", "game_id"], axis=1)
# drop columns that are not useful
play_df = play_df.drop(["play_type", "home", "away", "season", "offense", "defense", "offense_conference", "defense_conference"], axis=1)
# drop columns that reveal information that directly/indirectly reveals the play call
play_df = play_df.drop(["yards_gained", "play_text", "wallclock", "scoring", "ppa"], axis=1)
# drop columns that may or may not be useful
play_df = play_df.drop(["week", "drive_number"], axis=1)
# drop columns that were used to engineer new columns
play_df = play_df.drop(["offense_score", "defense_score", "clock"], axis=1)
# Drop rows with NaN values
play_df = play_df.dropna()
print(f"{len(play_df)} plays to analyze after dropping rows with NaN values.")
# print remaining columns
print(play_df.columns)
# create X and y
X = play_df.drop(["play_call"], axis=1)
y = play_df["play_call"]
# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
3230 plays to analyze after dropping rows with NaN values.
Index(['play_number', 'period', 'offense_timeouts', 'defense_timeouts', 'yard_line', 'yards_to_goal', 'down', 'distance', 'seconds_remaining', 'score_diff', 'passing_yards_per_attempt', 'rushing_yards_per_attempt', 'play_call', 'field_goal_pct', 'fourth_down_eff', 'third_down_eff'], dtype='object')
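Because field goals and punts are far rarer than passes and runs, a plain random split can leave a test set with very few examples of the minority classes. Passing `stratify=y` to `train_test_split` preserves the class proportions in both splits; a sketch on hypothetical labels with a similar imbalance:

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Toy imbalanced labels standing in for play_call (0=pass, 1=run, 2=FG, 3=punt)
rng = np.random.default_rng(42)
y = np.array([0] * 60 + [1] * 30 + [2] * 5 + [3] * 5)
X = rng.normal(size=(len(y), 3))

# stratify=y keeps the 60/30/5/5 class ratio in both the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(np.bincount(y_test))  # -> [12  6  1  1]
```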
Predict Play Calls¶
Decision Tree Classifier¶
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# set up a tuner to find the best parameters
param_grid = {
"criterion": ["gini", "entropy"],
"max_depth": [2, 4, 6, 8, 10, 12, None],
}
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_tuner = GridSearchCV(decision_tree_model, param_grid, cv=5, scoring="accuracy", n_jobs=-1, refit=True)
decision_tree_tuner.fit(X_train, y_train)
# print the best parameters
print(f"Best parameters: {decision_tree_tuner.best_params_}")
# make predictions
decision_tree_pred = decision_tree_tuner.predict(X_test)
# print the accuracy score
print(f"Accuracy score: {decision_tree_tuner.score(X_test, y_test)}")
Best parameters: {'criterion': 'entropy', 'max_depth': 6}
Accuracy score: 0.7492260061919505
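Since explainability is one of the decision tree's main selling points (see the Results section), the fitted rules can be printed directly with `sklearn.tree.export_text`. A sketch on a toy dataset; with the notebook's objects you would pass `decision_tree_tuner.best_estimator_` and `list(X.columns)` instead:

```python
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

# Toy stand-in for decision_tree_tuner.best_estimator_
X_iris, y_iris = load_iris(return_X_y=True, as_frame=True)
tree = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X_iris, y_iris)

# Prints the learned if/else rules with human-readable feature names
rules = export_text(tree, feature_names=list(X_iris.columns))
print(rules)
```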
Random Forest Classifier¶
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
# perform dimensionality reduction using truncated SVD
svd = TruncatedSVD(n_components=len(X.columns)-1)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)
# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# set up a tuner to find the best parameters
param_grid = {
"n_estimators": [600],
"max_depth": [10, 12, 16, None],
"criterion": ["gini"],
}
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_tuner = GridSearchCV(random_forest_model, param_grid, cv=5, scoring="accuracy", n_jobs=-1, refit=True)
random_forest_tuner.fit(X_train, y_train)
# make predictions
random_forest_pred = random_forest_tuner.predict(X_test)
# print best parameters
print(random_forest_tuner.best_params_)
# print accuracy
print(f"Accuracy: {random_forest_tuner.score(X_test, y_test)}")
{'criterion': 'gini', 'max_depth': 16, 'n_estimators': 600}
Accuracy: 0.7291021671826625
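One caveat about the cell above: it overwrites `X_train` and `X_test` in place, so the KNN, logistic regression, and MLP cells below also receive the SVD-reduced, scaled features. Wrapping the transforms and the model in a `Pipeline` keeps the preprocessing scoped to one estimator and refits it inside each cross-validation fold, avoiding leakage from the full training set into the folds. A sketch on synthetic data:

```python
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic data standing in for the notebook's 15-column X_train
X_toy, y_toy = make_classification(n_samples=200, n_features=15, random_state=42)

pipe = Pipeline([
    ("svd", TruncatedSVD(n_components=14, random_state=42)),
    ("scale", StandardScaler()),
    ("rf", RandomForestClassifier(random_state=42)),
])
# Pipeline step names prefix the grid keys, e.g. "rf__max_depth"
param_grid = {"rf__max_depth": [10, None]}
tuner = GridSearchCV(pipe, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
tuner.fit(X_toy, y_toy)
print(tuner.best_params_)
```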
K-Nearest Neighbors Classification¶
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# set up a tuner to find the best k value
param_grid = {
"n_neighbors": [50, 60, 70, 80, 90, 100, 200, 300],
"leaf_size": [25, 50, 100],
"p": [1, 2],
"algorithm": ["ball_tree", "kd_tree", "brute"],
}
knn_model = KNeighborsClassifier()
knn_tuner = GridSearchCV(knn_model, param_grid, cv=5, scoring="accuracy", n_jobs=-1, refit=True)
knn_tuner.fit(X_train, y_train)
# print best parameters
print(knn_tuner.best_params_)
# make predictions
knn_pred = knn_tuner.predict(X_test)
# print accuracy
print(f"Accuracy: {knn_tuner.score(X_test, y_test)}")
{'algorithm': 'ball_tree', 'leaf_size': 25, 'n_neighbors': 50, 'p': 1}
Accuracy: 0.6439628482972136
Logistic Regression¶
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# set up a tuner to find the best parameters
param_grid = {
"penalty": ["l2",],
"C": [0.1, 1, 10, 100],
"max_iter": [2000],
"solver": ["newton-cg", "saga",],
}
logistic_regression_model = LogisticRegression(random_state=42)
logistic_regression_tuner = GridSearchCV(logistic_regression_model, param_grid, cv=5, scoring="accuracy", n_jobs=-1, refit=True)
logistic_regression_tuner.fit(X_train, y_train)
# print best parameters
print(logistic_regression_tuner.best_params_)
# make predictions
logistic_regression_pred = logistic_regression_tuner.predict(X_test)
# print accuracy
print(f"Accuracy: {logistic_regression_tuner.score(X_test, y_test)}")
{'C': 10, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy: 0.6749226006191951
Multilayer Perceptron Neural Network¶
from sklearn.neural_network import MLPClassifier
# create and fit the neural network model
mlp_model = MLPClassifier(hidden_layer_sizes=(250,), activation='logistic', solver='adam', random_state=42)
mlp_model.fit(X_train, y_train)
# make predictions
mlp_pred = mlp_model.predict(X_test)
# print accuracy
print(f"Accuracy: {mlp_model.score(X_test, y_test)}")
Accuracy: 0.7167182662538699
c:\Users\Scott\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn(
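The `ConvergenceWarning` above means the adam optimizer hit the default cap of 200 iterations before the loss stabilized. One hedged fix is to raise `max_iter` and enable `early_stopping`, which holds out a slice of the training data and stops when the validation score plateaus. A sketch on synthetic data:

```python
from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier

# Synthetic data standing in for the notebook's training set
X_toy, y_toy = make_classification(n_samples=300, n_features=10, random_state=42)

mlp = MLPClassifier(
    hidden_layer_sizes=(250,),
    activation="logistic",
    solver="adam",
    max_iter=2000,          # raise the iteration cap well above the default 200
    early_stopping=True,    # hold out 10% of training data for validation
    n_iter_no_change=20,    # stop after 20 epochs without validation improvement
    random_state=42,
)
mlp.fit(X_toy, y_toy)
print(f"stopped after {mlp.n_iter_} iterations")
```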
Multilayer Perceptron Neural Network (Hyperparameter Search)¶
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
# define the parameter grid
param_grid = {
"hidden_layer_sizes": [(100,), (250,), (500,), (750,), (1000,)],
"activation": ["tanh", "relu", "logistic"],
'solver': ["sgd", "adam"],
}
# create the MLPClassifier model
mlp_hp_model = MLPClassifier(random_state=42, max_iter=2000)
# create the GridSearchCV tuner
mlp_tuner = GridSearchCV(mlp_hp_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# fit the tuner to the training data
mlp_tuner.fit(X_train, y_train)
# print the best hyperparameters
print(f"Best hyperparameters: {mlp_tuner.best_params_}")
# make predictions
mlp_hp_pred = mlp_tuner.predict(X_test)
# print accuracy
print(f"Accuracy: {mlp_tuner.score(X_test, y_test)}")
Best hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (500,), 'solver': 'sgd'}
Accuracy: 0.718266253869969
Model Comparison¶
Decision Tree Classifier¶
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# print accuracy
print(f"Accuracy: {accuracy_score(y_test, decision_tree_pred)}")
# print confusion matrix with class labels (sklearn puts actual classes on rows, predictions on columns)
print("Confusion matrix (rows: actual, columns: predicted):")
print(pd.DataFrame(confusion_matrix(y_test, decision_tree_pred), index=["Pass", "Run", "Field Goal", "Punt"], columns=["Pass", "Run", "Field Goal", "Punt"]))
# print feature importances with their corresponding columns
print("\nFeature importances:")
for feature, importance in zip(X.columns, decision_tree_tuner.best_estimator_.feature_importances_):
    print(f"{feature}: {round(importance * 100, 2)}%")
Accuracy: 0.7492260061919505
Confusion matrix (rows: actual, columns: predicted):
            Pass  Run  Field Goal  Punt
Pass         313   48           4     1
Run          101  142           3     0
Field Goal     3    0          11     0
Punt           1    1           0    18

Feature importances:
play_number: 2.25%
period: 1.09%
offense_timeouts: 0.48%
defense_timeouts: 0.0%
yard_line: 0.96%
yards_to_goal: 10.63%
down: 39.31%
distance: 8.59%
seconds_remaining: 2.12%
score_diff: 1.97%
passing_yards_per_attempt: 12.55%
rushing_yards_per_attempt: 20.05%
field_goal_pct: 0.0%
fourth_down_eff: 0.0%
third_down_eff: 0.0%
Random Forest Classifier¶
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# print accuracy
print(f"Accuracy: {accuracy_score(y_test, random_forest_pred)}")
# print confusion matrix with class labels (sklearn puts actual classes on rows, predictions on columns)
print("Confusion matrix (rows: actual, columns: predicted):")
print(pd.DataFrame(confusion_matrix(y_test, random_forest_pred), index=["Pass", "Run", "Field Goal", "Punt"], columns=["Pass", "Run", "Field Goal", "Punt"]))
# print feature importances
# NOTE: this forest was fit on the 14 SVD components, not the original columns,
# so zipping against X.columns mislabels the importances and silently drops the
# fifteenth column name
print("\nFeature importances:")
for feature, importance in zip(X.columns, random_forest_tuner.best_estimator_.feature_importances_):
    print(f"{feature}: {round(importance * 100, 2)}%")
Accuracy: 0.7291021671826625
Confusion matrix (rows: actual, columns: predicted):
            Pass  Run  Field Goal  Punt
Pass         306   56           1     3
Run           97  149           0     0
Field Goal     9    1           2     2
Punt           6    0           0    14

Feature importances:
play_number: 4.91%
period: 4.97%
offense_timeouts: 8.49%
defense_timeouts: 6.27%
yard_line: 5.64%
yards_to_goal: 12.87%
down: 10.51%
distance: 9.29%
seconds_remaining: 8.72%
score_diff: 5.6%
passing_yards_per_attempt: 4.87%
rushing_yards_per_attempt: 13.08%
field_goal_pct: 4.77%
fourth_down_eff: 0.0%
K-Nearest Neighbors Classification¶
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# print accuracy
print(f"Accuracy: {accuracy_score(y_test, knn_pred)}")
# print confusion matrix with class labels (sklearn puts actual classes on rows, predictions on columns)
print("Confusion matrix (rows: actual, columns: predicted):")
print(pd.DataFrame(confusion_matrix(y_test, knn_pred), index=["Pass", "Run", "Field Goal", "Punt"], columns=["Pass", "Run", "Field Goal", "Punt"]))
Accuracy: 0.6439628482972136
Confusion matrix (rows: actual, columns: predicted):
            Pass  Run  Field Goal  Punt
Pass         323   43           0     0
Run          155   91           0     0
Field Goal    14    0           0     0
Punt          14    4           0     2
Neural Network¶
# print accuracy
print(f"Accuracy: {accuracy_score(y_test, mlp_pred)}")
# print confusion matrix with class labels (sklearn puts actual classes on rows, predictions on columns)
print("Confusion matrix (rows: actual, columns: predicted):")
print(pd.DataFrame(confusion_matrix(y_test, mlp_pred), index=["Pass", "Run", "Field Goal", "Punt"], columns=["Pass", "Run", "Field Goal", "Punt"]))
Accuracy: 0.7167182662538699
Confusion matrix (rows: actual, columns: predicted):
            Pass  Run  Field Goal  Punt
Pass         297   63           3     3
Run          105  138           3     0
Field Goal     4    0           8     2
Punt           0    0           0    20
Neural Network (Hyperparameter Search)¶
# print accuracy
print(f"Accuracy: {accuracy_score(y_test, mlp_hp_pred)}")
# print confusion matrix with class labels (sklearn puts actual classes on rows, predictions on columns)
print("Confusion matrix (rows: actual, columns: predicted):")
print(pd.DataFrame(confusion_matrix(y_test, mlp_hp_pred), index=["Pass", "Run", "Field Goal", "Punt"], columns=["Pass", "Run", "Field Goal", "Punt"]))
Accuracy: 0.718266253869969
Confusion matrix (rows: actual, columns: predicted):
            Pass  Run  Field Goal  Punt
Pass         290   70           2     4
Run           97  144           4     1
Field Goal     2    1          10     1
Punt           0    0           0    20
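Accuracy alone hides how the models fare on the rare field goal and punt classes; the `precision_score`, `recall_score`, and `f1_score` imports above can be applied per class in one call with `classification_report`. A sketch on hypothetical labels and predictions standing in for `y_test` and a model's output:

```python
from sklearn.metrics import classification_report

# Hypothetical stand-ins for y_test and a model's predictions
y_true = [0, 0, 0, 1, 1, 2, 3, 3]
y_pred = [0, 0, 1, 1, 1, 2, 3, 0]

# Per-class precision, recall, and F1; zero_division=0 avoids warnings
# for classes a model never predicts
report = classification_report(
    y_true, y_pred,
    target_names=["Pass", "Run", "Field Goal", "Punt"],
    zero_division=0,
)
print(report)
```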
Results¶
These results show that the best-suited models predict Coach DeBoer's play calls with roughly 75% accuracy. Decision trees and random forests are easy to explain to non-technical audiences and inexpensive to train. The decision tree's logic also resembles the in-game thought process of a college football coach calling plays. For example, down and distance are heavily weighted in the decision tree and random forest models, just as a coach bases play calls largely on the down and the yards remaining. Coaches rarely elect to punt before 4th down, and they are more likely to call rushing plays when the line to gain is near.
In Coach DeBoer's case, the greatest difficulty in playcalling prediction is that the period studied spans two programs and two different rosters. With two years of data from Fresno State and two years from Washington, the results are bound to be somewhat muddled. Accuracy would most likely improve with a third season of data at the same school, but much more data than that would likely hurt the model: FBS players typically play three to four seasons, and the roster is never exactly the same from one season to the next.
More generally, the Pygskin project is limited by the lack of publicly available data on defensive formations. Play calls, and especially the execution of option plays, depend on reading the defense. Without that information, the project is unlikely to produce results approaching 100% accuracy. Further feature engineering and data analysis will be required.