Set Up
Imports, Global Variables, and Constants
import os
import pickle
import numpy as np
import pandas as pd
import pygskin
# Constants for saving and loading data.
year = 2023
# Season files live in a "data" folder inside the current working directory.
# Pass the components to os.path.join separately so it actually does the joining.
path_to_files = os.path.join(os.getcwd(), "data")

# Calculation constants.
RANDOM_STATE = 42

# Plot constants and variables.
USA_BOUNDS = [-121, -75, 23, 50]  # passed to GeoAxes.set_extent: [west, east, south, north]
POINT_SIZE = 20
POINT_ZORDER = 2
LINE_ZORDER = 1.5
# One colour per conference / cluster, looked up by integer index.
conference_colors = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "lime", "cyan", "magenta",
    "yellow", "gray", "olive", "maroon", "navy",
    "teal", "gold", "darkorange", "darkgreen", "darkred",
    "darkblue", "darkgray", "darkcyan", "darkmagenta", "darkkhaki",
    "darkgoldenrod", "darkslategray", "darkolivegreen", "darkseagreen", "darkslateblue",
    "darkturquoise", "darkviolet", "deeppink", "deepskyblue", "dimgray",
    "dodgerblue", "firebrick", "forestgreen", "fuchsia", "gainsboro",
    "ghostwhite", "goldenrod", "greenyellow", "hotpink", "indianred",
]
Load Season Data
If season data is present in .cfb (pickled) form in the data/year folder, load it. Otherwise, use the CFBD API to download, save, and load it.
If the class definition of a Team, Season, or SeasonAnalyzer is altered, the loaded information will not function as intended. The first troubleshooting step is to delete your data folder and try again.
# Pickled season data for the selected year lives at data/<year>/season_<year>.cfb.
season_dir = os.path.join(path_to_files, str(year))
season_file = os.path.join(season_dir, f"season_{year}.cfb")

if os.path.exists(season_file):
    print(f"Loading {year} season from file...")
    # NOTE(review): pickle.load can execute arbitrary code from the file.
    # Only load .cfb files that this notebook wrote itself.
    with open(season_file, "rb") as f:
        season = pickle.load(f)
    print(f"Loaded {year} season from file.")
else:
    print(f"Loading {year} season from API...")
    season = pygskin.Season.from_cfbd_api(year)
    print(f"Loaded {year} season from API.")
    # Create data/<year>, including the parent "data" folder on a first run.
    # os.mkdir would fail here if "data" itself did not exist yet.
    os.makedirs(season_dir, exist_ok=True)
    # Pickle the whole season.
    with open(season_file, "wb") as f:
        pickle.dump(season, f)
    # Pickle every Team in the Season to its own file.
    for school in season.teams.keys():
        with open(os.path.join(season_dir, f"team_{school}.cfb"), "wb") as f:
            pickle.dump(season.teams[school], f)
    print(f"Saved {year} season to file.")

analysis = pygskin.SeasonAnalyzer(season)
Loading 2023 season from file... Loaded 2023 season from file.
Data Visualization
This map shows the actual locations of all FBS schools in the continental United States. Hawai'i is the only school not visible with the default bounds; including it would greatly skew the scale of the map.
import cartopy.mpl.geoaxes
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
# Create a figure with a single map axes; the projection is passed to that axes.
fig, ax = plt.subplots(
    figsize=(10, 10),
    subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)),
)
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States.
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes, and land, then coastlines.
for feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(feature)
ax.coastlines()

# Plot the schools, one colour per conference. The "conference" column holds
# integer ids that index both conference_colors and analysis.conferences.
conference_ids = analysis.school_locations["conference"].unique()
for conference in conference_ids:
    members = analysis.school_locations[analysis.school_locations["conference"] == conference]
    ax.scatter(
        members["longitude"],
        members["latitude"],
        color=conference_colors[conference],
        s=POINT_SIZE,
        transform=ccrs.Geodetic(),
        linewidth=0.5,
        edgecolor="black",
        zorder=POINT_ZORDER,
    )
# NOTE: a per-conference centroid and maximum distance used to be computed
# here, but the circle that consumed them was commented out, so the dead
# computation has been removed.

# Add a title to the plot.
ax.set_title(f"{analysis.season.year} FBS Schools")

# Build legend entries with the names of the conferences.
# legend_elements is kept as a module-level name because later cells read it.
legend_elements = [
    plt.Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        label=analysis.conferences[conference],
        markerfacecolor=conference_colors[conference],
        markersize=10,
    )
    for conference in conference_ids
]
plt.show()

# Show the legend as its own figure, separate from the map.
fig_legend = plt.figure(figsize=(2, 2))
axi = fig_legend.add_subplot(111)
fig_legend.legend(handles=legend_elements, loc="center", title="Conference")
# Hide the ticks, spines, and frame of the placeholder axes in one call.
axi.set_axis_off()
K-Nearest Neighbors Model
Use the k-nearest neighbors algorithm to predict teams' conference based exclusively on longitude and latitude.
Due to the way that team information is reported by the CFBD API, it is necessary to drop all schools with a conference value of "FBS Independents". These schools do not actually belong to a conference, so they should not be included in the training data.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
print("Training KNN model...")

# Preprocess the data: the coordinate columns are the features and the
# conference id is the target.
df = analysis.school_locations.copy()
# Remove schools whose conference is "FBS Independents"; they do not belong
# to a real conference. Skip the filter when the season has no such entry.
if "FBS Independents" in analysis.conferences:
    df = df[df["conference"] != analysis.conferences.index("FBS Independents")]

# NOTE(review): assumes the only columns left after the drop are longitude
# and latitude. Confirm against SeasonAnalyzer.school_locations.
X = df.drop(columns=["school", "conference"])
y = df["conference"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Train the model.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Test the model and print its accuracy.
y_pred = knn.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Build the results table keyed on the test-set index. y_pred is a plain
# array in X_test's shuffled row order, so it is wrapped in a Series with
# X_test's index. Assigning it positionally onto rows in a different order
# would attach predictions to the wrong schools.
knn_df = pd.DataFrame(
    {
        "school": df.loc[X_test.index, "school"],
        "longitude": X_test["longitude"],
        "latitude": X_test["latitude"],
        "conference": y_test,
        "predicted": pd.Series(y_pred, index=X_test.index),
    }
).sort_index()
knn_df["correct"] = knn_df["conference"] == knn_df["predicted"]

# Printable copy: drop the coordinates and show conference names, not ids.
printable_knn_df = knn_df.drop(columns=["longitude", "latitude"])
printable_knn_df["conference"] = [analysis.conferences[conference] for conference in printable_knn_df["conference"]]
printable_knn_df["predicted"] = [analysis.conferences[conference] for conference in printable_knn_df["predicted"]]
print(printable_knn_df)
Training KNN model... Accuracy: 0.23076923076923078 school conference predicted correct 4 Arizona Pac-12 American Athletic False 11 Baylor Big 12 Big Ten False 12 Boise State Mountain West Mid-American False 19 Charlotte American Athletic Pac-12 False 20 Cincinnati Big 12 Mountain West False 28 Eastern Michigan Mid-American SEC False 29 Florida SEC Conference USA False 33 Fresno State Mountain West Big Ten False 38 Hawai'i Mountain West Pac-12 False 42 Iowa Big Ten Mid-American False 47 Kansas State Big 12 ACC False 57 Maryland Big Ten Sun Belt False 58 Memphis American Athletic Big 12 False 66 Missouri SEC Conference USA False 71 New Mexico Mountain West American Athletic False 83 Ole Miss SEC ACC False 84 Oregon Pac-12 Pac-12 True 87 Pittsburgh ACC SEC False 96 South Carolina SEC Sun Belt False 98 South Florida American Athletic Mid-American False 100 Syracuse ACC ACC True 103 Tennessee SEC American Athletic False 108 Toledo Mid-American ACC False 118 Utah Pac-12 Pac-12 True 123 Virginia ACC SEC False 128 Western Kentucky Conference USA Big 12 False
Actual vs. Predicted Clusters
Evaluate the results of the k-nearest neighbors algorithm. Plot the points with a correctly predicted conference in green and incorrectly predicted in red. Output the results.
# Create a figure with a single map axes; the projection is passed to that axes.
fig, ax = plt.subplots(
    figsize=(10, 10),
    subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)),
)
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States.
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes, and land, then coastlines.
for feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(feature)
ax.coastlines()

# Plot every test-set school in one call: green when the predicted
# conference matches the actual one, red otherwise.
prediction_colors = ["green" if correct else "red" for correct in knn_df["correct"]]
ax.scatter(
    knn_df["longitude"],
    knn_df["latitude"],
    transform=ccrs.Geodetic(),
    s=POINT_SIZE,
    color=prediction_colors,
    linewidth=0.5,
    edgecolor="black",
    zorder=POINT_ZORDER,
)

# Label each school with the name of its actual conference.
for row in knn_df.itertuples():
    ax.text(
        row.longitude,
        row.latitude,
        analysis.conferences[row.conference],
        transform=ccrs.Geodetic(),
        horizontalalignment="left",
        verticalalignment="bottom",
        # Axes.text does not clip by default; clip so that labels of schools
        # outside the map extent are not drawn outside the axes.
        clip_on=True,
    )

# Add a legend to the plot.
ax.legend(
    handles=[plt.Line2D([0], [0], color="green", lw=4), plt.Line2D([0], [0], color="red", lw=4)],
    labels=["Correct", "Incorrect"],
)
# Add a title to the plot.
ax.set_title(f"{analysis.season.year} FBS Conference Members - K-Nearest Neighbors Predictions")
plt.show()
K-Means Clustering
In the year 2023, there are 11 conferences in the FBS. The k-means algorithm uses a k value equal to the number of conferences in the FBS that season, not counting FBS Independents. Because FBS conferences typically have 8 to 18 members, this is the range of acceptable points per cluster.
from sklearn.cluster import KMeans
k_means_df = analysis.school_locations.copy()
X = k_means_df.drop(columns=["school", "conference"])

# Acceptable number of schools per cluster (8 to 18 inclusive).
allowed_cluster_size = range(8, 19)
smallest_allowed, largest_allowed = allowed_cluster_size[0], allowed_cluster_size[-1]
# Upper bound on re-fits, so the search always terminates even when no
# clustering satisfies the size limits.
MAX_KMEANS_ATTEMPTS = 1000

# Re-fit k-means until BOTH the smallest and the largest cluster are within
# the allowed range, keeping the attempt that violates the limits the least.
# random_state is deliberately left unset: with a fixed seed every retry
# would produce the same clustering.
k_means, best_violation = None, None
for _attempt in range(MAX_KMEANS_ATTEMPTS):
    # Each fit runs at most 50 Lloyd iterations.
    candidate = KMeans(n_clusters=analysis.num_conferences, n_init="auto", max_iter=50)
    candidate.fit(X)
    # Number of schools assigned to each cluster label.
    candidate_sizes = np.bincount(candidate.labels_, minlength=analysis.num_conferences).tolist()
    # Total number of schools by which the extremes fall outside the limits.
    violation = max(0, smallest_allowed - min(candidate_sizes)) + max(0, max(candidate_sizes) - largest_allowed)
    if best_violation is None or violation < best_violation:
        k_means, best_violation, conf_members = candidate, violation, candidate_sizes
    if violation == 0:
        break

min_cluster_size = min(conf_members)
max_cluster_size = max(conf_members)
if best_violation:
    print(
        f"Warning: no clustering with all sizes in [{smallest_allowed}, {largest_allowed}] "
        f"found in {MAX_KMEANS_ATTEMPTS} attempts; using the closest "
        f"(min: {min_cluster_size}, max: {max_cluster_size})."
    )

# Get the cluster label for each data point.
labels = k_means.labels_
# Get the cluster centers (columns are in the same order as X's columns).
centers = k_means.cluster_centers_
# Add the cluster labels to the dataframe.
k_means_df["predicted_cluster"] = labels
Visualize the K-Means Clusters
Each cluster represents one potential conference created by the k-means algorithm. The circles around each cluster are centered on the centroid and intersect the data point furthest from the centroid. Clusters are color-coded according to the legend below the map.
The clusters generated by the k-means algorithm are not an attempt to recreate any existing conference. They represent potential conferences based solely on geographic locality.
# Create a figure with a single map axes; the projection is passed to that axes.
fig, ax = plt.subplots(
    figsize=(10, 10),
    subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)),
)
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States.
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes, and land, then coastlines.
for feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(feature)
ax.coastlines()

# Label the columns of the fitted centers with the feature names they were
# trained on, so longitude and latitude are selected by name, not position.
feature_columns = k_means_df.columns.drop(["school", "conference", "predicted_cluster"])
centers_df = pd.DataFrame(centers, columns=feature_columns)

for cluster_label in range(analysis.num_conferences):
    cluster_data = k_means_df[k_means_df["predicted_cluster"] == cluster_label]
    if cluster_data.empty:
        # Nothing to draw for a cluster without members.
        continue
    # Plot the cluster's schools in the cluster's colour.
    ax.scatter(
        cluster_data["longitude"],
        cluster_data["latitude"],
        transform=ccrs.Geodetic(),
        s=POINT_SIZE,
        label=f"Cluster {cluster_label}",
        color=conference_colors[cluster_label],
        linewidth=0.5,
        edgecolor="black",
        zorder=POINT_ZORDER,
    )
    # Circle centred on the centroid and passing through the member furthest
    # from it. Distances are measured in degrees of longitude/latitude.
    centroid = centers_df.loc[cluster_label, ["longitude", "latitude"]].to_numpy(dtype=np.float64)
    member_coords = cluster_data[["longitude", "latitude"]].to_numpy(dtype=np.float64)
    radius = np.linalg.norm(member_coords - centroid, axis=1).max()
    ax.add_patch(
        plt.Circle(
            (centroid[0], centroid[1]),  # (longitude, latitude)
            radius,
            edgecolor=conference_colors[cluster_label],
            fill=False,
            linewidth=1.5,
            transform=ccrs.Geodetic(),
            zorder=LINE_ZORDER,
        )
    )

# Add a title to the plot.
ax.set_title(f"{analysis.season.year} FBS Conference Suggestions - K-Means Clusters")
# Collect the per-cluster legend entries created by the scatter labels.
# Separate names are used so the k-means `labels` array is not overwritten.
legend_handles, legend_labels = ax.get_legend_handles_labels()
plt.show()

# Show the cluster legend as its own figure, separate from the map.
# The clusters are not real conferences, so the legend lists cluster labels
# rather than the conference names used on the first map.
fig_legend = plt.figure(figsize=(2, 2))
axi = fig_legend.add_subplot(111)
fig_legend.legend(handles=legend_handles, labels=legend_labels, loc="center", title="Cluster")
# Hide the ticks, spines, and frame of the placeholder axes in one call.
axi.set_axis_off()