Set Up
Imports, Global Variables, and Constants
import os
import pickle
import numpy as np
import pandas as pd
import pygskin
# constants for saving and loading data
year = 2023
# Season files are stored under ./data/<year>/ relative to the current working directory.
# os.path.join is given the two components separately so it inserts the separator itself.
path_to_files = os.path.join(os.getcwd(), "data")

# calculation constants
RANDOM_STATE = 42  # seed passed to train_test_split

# plot constants and variables
USA_BOUNDS = [-121, -75, 23, 50]  # passed to ax.set_extent for every map
POINT_SIZE = 20
POINT_ZORDER = 2
LINE_ZORDER = 1.5  # lines are drawn beneath the school markers
# Indexed by the integer conference / cluster label used throughout the plots.
conference_colors = [
    "red", "blue", "green", "orange", "purple", "brown", "pink", "lime", "cyan",
    "magenta", "yellow", "gray", "olive", "maroon", "navy", "teal", "gold",
    "darkorange", "darkgreen", "darkred", "darkblue", "darkgray", "darkcyan",
    "darkmagenta", "darkkhaki", "darkgoldenrod", "darkslategray", "darkolivegreen",
    "darkseagreen", "darkslateblue", "darkturquoise", "darkviolet", "deeppink",
    "deepskyblue", "dimgray", "dodgerblue", "firebrick", "forestgreen", "fuchsia",
    "gainsboro", "ghostwhite", "goldenrod", "greenyellow", "hotpink", "indianred",
]
Load Season Data
If season data is present in .cfb (pickled) form in the data/year folder, load it. Otherwise, use the CFBD API to download, save, and load it.
If the class definition of Team, Season, or SeasonAnalyzer is altered, previously pickled data may no longer load or function as intended. The first troubleshooting step is to delete your data folder and try again.
# Load the season from data/<year>/season_<year>.cfb if it exists; otherwise
# download it through pygskin and cache it (plus one file per team) on disk.
season_dir = os.path.join(path_to_files, str(year))
season_file = os.path.join(season_dir, f"season_{year}.cfb")

if os.path.exists(season_file):
    print(f"Loading {year} season from file...")
    # NOTE: unpickling can execute arbitrary code; only load .cfb files you created yourself.
    with open(season_file, "rb") as f:
        season = pickle.load(f)
    print(f"Loaded {year} season from file.")
else:
    print(f"Loading {year} season from API...")
    season = pygskin.Season.from_cfbd_api(year)
    print(f"Loaded {year} season from API.")
    # Create the season directory together with the parent data folder.
    # os.mkdir would fail when the data folder itself is missing, e.g. on a
    # first run or after deleting the data folder to troubleshoot.
    os.makedirs(season_dir, exist_ok=True)
    # pickle season
    with open(season_file, "wb") as f:
        pickle.dump(season, f)
    # pickle all Teams in Season
    # NOTE(review): this branch reads season.teams while the 2024 adjustment
    # code below uses season.teams_dict; confirm which attribute pygskin.Season exposes.
    for school in season.teams.keys():
        with open(os.path.join(season_dir, f"team_{school}.cfb"), "wb") as f:
            pickle.dump(season.teams[school], f)
    print(f"Saved {year} season to file.")
Loading 2023 season from file... Loaded 2023 season from file.
2024 Season Adjustments
Because CFBD does not yet have all the data for the 2024 realignment, we use manually compiled adjustments to the 2023 season to perform our calculations. These changes are compiled from publicly available announcements from the schools themselves and from articles gathering multiple announcements.
# Apply the 2024 realignment on top of the loaded 2023 season, then build the analyzer.

# Conference changes for schools that are already present in the 2023 season data.
realignment_2024 = {
    # Schools moving to ACC: SMU, Stanford, Cal
    "SMU": "ACC",
    "Stanford": "ACC",
    "California": "ACC",
    # Schools moving to Big 12: Arizona State, Arizona, Colorado, Utah
    "Arizona State": "Big 12",
    "Arizona": "Big 12",
    "Colorado": "Big 12",
    "Utah": "Big 12",
    # Schools moving to Big Ten: UCLA, USC, Washington, Oregon
    "UCLA": "Big Ten",
    "USC": "Big Ten",
    "Washington": "Big Ten",
    "Oregon": "Big Ten",
    # Schools moving to SEC: Texas, Oklahoma
    "Texas": "SEC",
    "Oklahoma": "SEC",
    # Schools moving to AAC: Army
    "Army": "American Athletic",
}
for moving_school, new_conference in realignment_2024.items():
    season.teams_dict[moving_school].info["conference"] = new_conference

# Schools moving to C-USA: Kennesaw State
# The previous code reassigned "Kent State" (a Mid-American school) here, which
# contradicted the comment above; Kent State is now left in its 2023 conference.
if "Kennesaw State" in season.teams_dict:
    season.teams_dict["Kennesaw State"].info["conference"] = "Conference USA"
else:
    # Kennesaw State is presumably absent from the 2023 FBS data, so it is added
    # the same way as Delaware below.
    # NOTE(review): only the location is used by the distance and clustering
    # code in this notebook. The id is a placeholder, and the colors, logo,
    # twitter handle, elevation, capacity, and construction year are
    # best-effort values; confirm them before relying on them.
    kennesaw_state = pygskin.Team(
        school="Kennesaw State",
        year=2023,
        info={
            "id": 99998,
            "school": "Kennesaw State",
            "mascot": "Owls",
            "abbreviation": "KENN",
            "alt_name_1": "Kennesaw State University",
            "alt_name_2": "KSU",
            "alt_name_3": "Owls",
            "classification": "None",
            "conference": "Conference USA",
            "division": "None",
            "color": "#0b1315",
            "alt_color": "#fdbb30",
            "logos": [
                "https://a.espncdn.com/i/teamlogos/ncaa/500/338.png"
            ],
            "twitter": "@kennesawstFB",
            "location": {
                "venue_id": None,
                "name": "Fifth Third Stadium",
                "city": "Kennesaw",
                "state": "GA",
                "zip": "30144",
                "country_code": "US",
                "timezone": "America/New_York",
                "latitude": 34.0290,
                "longitude": -84.5676,
                "elevation": 1080.0,  # assumes the same units as the Delaware entry; TODO confirm
                "capacity": 10200,
                "year_constructed": 2010,
                "grass": True,
                "dome": False,
            }
        },
        wins=0,
        losses=0,
        ties=0,
    )
    season.teams_dict["Kennesaw State"] = kennesaw_state

# Schools moving up from FCS: Delaware
delaware = pygskin.Team(
    school="Delaware",
    year=2023,
    info={
        "id": 99999,
        "school": "Delaware",
        "mascot": "Blue Hens",
        "abbreviation": "DEL",
        "alt_name_1": "University of Delaware",
        "alt_name_2": "UD",
        "alt_name_3": "Blue Hens",
        "classification": "None",
        "conference": "Conference USA",
        "division": "None",
        "color": "#00539f",
        "alt_color": "#ffd200",
        "logos": [
            "https://a.espncdn.com/i/teamlogos/ncaa/500-dark/48.png"
        ],
        "twitter": "@UDBlueHens",
        "location": {
            "venue_id": None,
            "name": "Delaware Stadium",
            "city": "Newark",
            "state": "DE",
            "zip": "19716",
            "country_code": "US",
            "timezone": "America/New_York",
            "latitude": 39.6617,
            "longitude": -75.7488,
            "elevation": 98.0,
            "capacity": 18500,
            "year_constructed": 1952,
            "grass": True,
            "dome": False,
        }
    },
    wins=0,
    losses=0,
    ties=0,
)
season.teams_dict["Delaware"] = delaware

# Build the analyzer only after every adjustment has been applied.
analysis = pygskin.SeasonAnalyzer(season)
Data Visualization
This map shows the actual locations of all FBS schools in the continental United States. Hawai'i is the only school not visible with the default bounds, as including it would greatly skew the scale of the map.
import cartopy.mpl.geoaxes
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
# Map of every school in analysis.school_locations, colored by conference.
# Create a figure with an axes object on which we will plot. Pass the projection to that axes.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)))
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes, and land (in that order)
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
# Add coastlines to plot
ax.coastlines()

# Add the locations of the schools to the plot with a different color for each conference.
# The conference value is used directly as an index into conference_colors and
# analysis.conferences. Conference circles are not drawn on this map, so no
# centroid or radius is computed here.
conference_ids = analysis.school_locations["conference"].unique()
for conference in conference_ids:
    conf_schools = analysis.school_locations[analysis.school_locations["conference"] == conference]
    ax.scatter(
        conf_schools["longitude"],
        conf_schools["latitude"],
        color=conference_colors[conference],
        s=POINT_SIZE,
        transform=ccrs.Geodetic(),
        linewidth=0.5,
        edgecolor="black",
        zorder=POINT_ZORDER,
    )

# Add a title to the plot
ax.set_title("2024 FBS Schools")
# create a legend with the names of the conferences
# (legend_elements is reused by the later maps in this notebook)
legend_elements = [
    plt.Line2D([0], [0], marker="o", color="w", label=analysis.conferences[conference], markerfacecolor=conference_colors[conference], markersize=10)
    for conference in conference_ids
]
ax.legend(title="Conference", bbox_to_anchor=(1.05, 1), loc="upper left", handles=legend_elements)
plt.show()
Mean and Median Distances within Conferences
In this section, we calculate the distances (as the crow flies) between every pair of schools within each conference. For each conference, we report the mean and median distance values.
Distance Calculations
import math
def haversine(coord1: tuple[float, float], coord2: tuple[float, float]) -> float:
    """Return the great-circle distance in miles between two points on Earth.

    Uses the Haversine formula on a sphere of radius 3958.8 miles.

    Args:
        coord1: (longitude, latitude) of the first point, in decimal degrees.
        coord2: (longitude, latitude) of the second point, in decimal degrees.

    Returns:
        Distance between the two points in miles.
    """
    R = 3958.8  # Radius of the Earth in miles

    # Coordinates in decimal degrees (e.g., -79.49, 43.60); longitude comes first
    lon1, lat1 = coord1
    lon2, lat2 = coord2

    # Convert decimal degrees to radians
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    # Haversine formula
    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c  # Output distance in miles
# Build one row per ordered pair of distinct schools within each conference.
# Each unordered pair therefore appears twice, as (A, B) and as (B, A).
conf_dist_list = []
for conference in analysis.school_locations["conference"].unique():
    conf_members = analysis.school_locations[analysis.school_locations["conference"] == conference]
    for school in conf_members.itertuples():
        for school_2 in conf_members.itertuples():
            if school == school_2:  # don't calculate distance between the same school
                continue
            dist = haversine((school.longitude, school.latitude), (school_2.longitude, school_2.latitude))
            conf_dist_list.append([conference, school.school, school_2.school, dist])
conference_distances = pd.DataFrame(conf_dist_list, columns=["conference", "school_1", "school_2", "distance"])

# Attach per-conference summary statistics to every row. groupby/transform
# gives each row the statistic of its own conference; assigning a scalar to
# the whole column inside a loop would leave only the last conference's value.
distance_by_conference = conference_distances.groupby("conference")["distance"]
conference_distances["max_distance"] = distance_by_conference.transform("max")
conference_distances["min_distance"] = distance_by_conference.transform("min")
conference_distances["avg_distance"] = distance_by_conference.transform("mean")
conference_distances["median_distance"] = distance_by_conference.transform("median")
Plot Mean vs. Median Distances in Conferences
# Scatter plot with one point per conference:
# x = mean pairwise distance, y = median pairwise distance (both in miles).
fig, ax = plt.subplots()
ax: plt.Axes
for conference in conference_distances["conference"].unique():
    in_conference = conference_distances["conference"] == conference
    conf = conference_distances[in_conference]
    mean = conference_distances[in_conference]["distance"].mean()
    median = conference_distances[in_conference]["distance"].median()
    ax.scatter(
        x=mean,
        y=median,
        label=analysis.conferences[conference],
        color=conference_colors[conference],
    )
ax.set_xlabel("Mean Distance (miles)")
ax.set_ylabel("Median Distance (miles)")
ax.set_title("Mean vs. Median Distance Between Schools in a Conference")
# Place the legend outside the axes, to the right of the plot.
ax.legend(title="Conference", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()
Closest and Farthest School Pairs
This section displays the closest and farthest pairs of schools in each conference.
# For each conference (in order of first appearance in conference_distances),
# record the (school_1, school_2) names of its closest and farthest pair.
closest_pairs = []
farthest_pairs = []
for conference in conference_distances["conference"].unique():
    # Search only within this conference: matching a distance value against
    # the whole frame could pick up an equal distance from another conference.
    conf = conference_distances[conference_distances["conference"] == conference]
    closest = conf.loc[conf["distance"].idxmin()]
    farthest = conf.loc[conf["distance"].idxmax()]
    closest_pairs.append((closest["school_1"], closest["school_2"]))
    farthest_pairs.append((farthest["school_1"], farthest["school_2"]))
Closest Pairs of Schools in Each Conference
# create a map of the closest pairs of schools in each conference, drawing a line between the two schools
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)))
ax: cartopy.mpl.geoaxes.GeoAxes
ax.set_extent(USA_BOUNDS)
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
ax.coastlines()

# Markers and connecting line are drawn from the same per-conference lookup, so
# they cannot drift apart the way a positional index into a separately built
# list can when the two iteration orders differ.
for conference in conference_distances["conference"].unique():
    conf = conference_distances[conference_distances["conference"] == conference]
    closest_pair = conf[conf["distance"] == conf["distance"].min()]
    school_1 = analysis.school_locations[analysis.school_locations["school"] == closest_pair["school_1"].values[0]].iloc[0]
    school_2 = analysis.school_locations[analysis.school_locations["school"] == closest_pair["school_2"].values[0]].iloc[0]
    pair_longitudes = [school_1.longitude, school_2.longitude]
    pair_latitudes = [school_1.latitude, school_2.latitude]
    ax.scatter(pair_longitudes, pair_latitudes, color=conference_colors[conference], s=POINT_SIZE, transform=ccrs.Geodetic(), linewidth=0.5, edgecolor="black", zorder=POINT_ZORDER)
    ax.plot(pair_longitudes, pair_latitudes, color=conference_colors[conference], transform=ccrs.Geodetic(), zorder=LINE_ZORDER)
    print(f"Closest pair of schools in {analysis.conferences[conference]}: {school_1.school} and {school_2.school}")

ax.set_title("Closest Pairs of Schools in Each Conference")
# add legend (reuses the conference legend handles built for the first map);
# no separate legend figure is created, which previously produced an empty 2x2 figure
ax.legend(title="Conference", bbox_to_anchor=(1.05, 1), loc="upper left", handles=legend_elements)
plt.show()
Closest pair of schools in Mountain West: Colorado State and Wyoming Closest pair of schools in Mid-American: Bowling Green and Toledo Closest pair of schools in SEC: Alabama and Mississippi State Closest pair of schools in Sun Belt: South Alabama and Southern Mississippi Closest pair of schools in Big 12: BYU and Utah Closest pair of schools in American Athletic: Navy and Temple Closest pair of schools in ACC: Duke and North Carolina Closest pair of schools in FBS Independents: Connecticut and UMass Closest pair of schools in Conference USA: New Mexico State and UTEP Closest pair of schools in Big Ten: UCLA and USC Closest pair of schools in Pac-12: Oregon State and Washington State
<Figure size 200x200 with 0 Axes>
Farthest Pair of Schools in Each Conference
# create a map of the farthest pairs of schools in each conference, drawing a line between the two schools
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)))
ax: cartopy.mpl.geoaxes.GeoAxes
ax.set_extent(USA_BOUNDS)
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
ax.coastlines()

# Markers, line, and printed names all come from the same per-conference
# lookup, so they cannot drift apart the way a positional index into a
# separately built list can when the two iteration orders differ.
for conference in conference_distances["conference"].unique():
    conf = conference_distances[conference_distances["conference"] == conference]
    farthest_pair = conf[conf["distance"] == conf["distance"].max()]
    school_1 = analysis.school_locations[analysis.school_locations["school"] == farthest_pair["school_1"].values[0]].iloc[0]
    school_2 = analysis.school_locations[analysis.school_locations["school"] == farthest_pair["school_2"].values[0]].iloc[0]
    pair_longitudes = [school_1.longitude, school_2.longitude]
    pair_latitudes = [school_1.latitude, school_2.latitude]
    ax.scatter(pair_longitudes, pair_latitudes, color=conference_colors[conference], s=POINT_SIZE, transform=ccrs.Geodetic(), linewidth=0.5, edgecolor="black", zorder=POINT_ZORDER)
    ax.plot(pair_longitudes, pair_latitudes, color=conference_colors[conference], transform=ccrs.Geodetic(), zorder=LINE_ZORDER)
    print(f"Farthest pair of schools in {analysis.conferences[conference]}: {school_1.school} and {school_2.school}")

ax.set_title("Farthest Pairs of Schools in Each Conference")
# add legend (reuses the conference legend handles built for the first map);
# no separate legend figure is created, which previously produced an empty 2x2 figure
ax.legend(title="Conference", bbox_to_anchor=(1.05, 1), loc="upper left", handles=legend_elements)
plt.show()
Farthest pair of schools in Mountain West: Air Force and Hawai'i Farthest pair of schools in Mid-American: Buffalo and Northern Illinois Farthest pair of schools in SEC: South Carolina and Texas Farthest pair of schools in Sun Belt: Old Dominion and Texas State Farthest pair of schools in Big 12: UCF and Utah Farthest pair of schools in American Athletic: Army and UT San Antonio Farthest pair of schools in ACC: Boston College and Stanford Farthest pair of schools in FBS Independents: Notre Dame and UMass Farthest pair of schools in Conference USA: UTEP and Delaware Farthest pair of schools in Big Ten: Oregon and Rutgers Farthest pair of schools in Pac-12: Oregon State and Washington State
<Figure size 200x200 with 0 Axes>
K-Nearest Neighbors Model
Use the k-nearest neighbors algorithm to predict teams' conference based exclusively on longitude and latitude.
The k-nearest neighbors algorithm is a classification algorithm that attempts to determine which class (conference) a data point belongs to. Because college football conferences are no longer geographically based, it becomes apparent that exclusively using longitude and latitude to predict a school's conference is barely more accurate than random chance.
Due to the way that team information is reported by the CFBD API, it is necessary to drop all schools with a conference value of "FBS Independents". These schools do not actually belong to a conference, so they should not be included in the training data.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
# Train a k-nearest-neighbors classifier that predicts a school's conference
# from its coordinates, then tabulate the predictions for the held-out schools.
print("Training KNN model...")
# Preprocess the data. Drop school and conference columns, use conference as target
df = analysis.school_locations.copy()
# remove schools with the conference name "FBS Independents"
# (the conference column holds indices into analysis.conferences)
df = df[df["conference"] != analysis.conferences.index("FBS Independents")]
# X has columns longitude and latitude
X = df.drop(columns=["school", "conference"])
# y has column conference
y = df["conference"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
knn = KNeighborsClassifier(n_neighbors=5)
# train the model
knn.fit(X_train, y_train)
# test the model
y_pred = knn.predict(X_test)
# print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Assemble the results table. y_pred is a plain array in X_test row order, so
# it is wrapped in a Series carrying X_test's index before being combined with
# the other columns. Assigning the bare array to a frame ordered differently
# from X_test pairs predictions with the wrong schools.
# School names are looked up by index label rather than by matching coordinates.
# assumes df has a unique index; TODO confirm
knn_df = pd.DataFrame(
    {
        "school": df.loc[X_test.index, "school"],
        "longitude": X_test["longitude"],
        "latitude": X_test["latitude"],
        "conference": y_test,
        "predicted": pd.Series(y_pred, index=X_test.index),
    }
).sort_index()  # list schools in their original order
knn_df["correct"] = knn_df["conference"] == knn_df["predicted"]

# Human-readable copy: conference names instead of indices, no coordinates.
printable_knn_df = knn_df.drop(columns=["longitude", "latitude"])
printable_knn_df["conference"] = [analysis.conferences[conference] for conference in printable_knn_df["conference"]]
printable_knn_df["predicted"] = [analysis.conferences[conference] for conference in printable_knn_df["predicted"]]
print(printable_knn_df)
Training KNN model... Accuracy: 0.25925925925925924 school conference predicted correct 4 Arizona Big 12 Mid-American False 10 Ball State Mid-American Big Ten False 11 Baylor Big 12 ACC False 18 Central Michigan Mid-American SEC False 19 Charlotte American Athletic Mountain West False 27 East Carolina American Athletic Conference USA False 28 Eastern Michigan Mid-American Mountain West False 32 Florida State ACC Conference USA False 37 Georgia Tech ACC Sun Belt False 41 Indiana Big Ten ACC False 46 Kansas Big 12 SEC False 56 Marshall Sun Belt Mid-American False 57 Maryland Big Ten American Athletic False 65 Mississippi State SEC Mountain West False 70 Nevada Mountain West Mountain West True 71 New Mexico Mountain West American Athletic False 82 Old Dominion Sun Belt SEC False 83 Ole Miss SEC ACC False 86 Penn State Big Ten SEC False 95 South Alabama Sun Belt Mid-American False 97 Southern Mississippi Sun Belt Mid-American False 99 Stanford ACC Mid-American False 102 Temple American Athletic ACC False 107 Texas Tech Big 12 SEC False 118 Utah Big 12 ACC False 125 Wake Forest ACC SEC False 130 West Virginia Big 12 Big 12 True
Actual vs. Predicted Clusters
Evaluate the results of the k-nearest neighbors algorithm. Plot the points with a correctly predicted conference in green and those with an incorrectly predicted conference in red. Output the results.
# Map of the KNN test-set schools: green markers for correct predictions,
# red for incorrect ones, each labelled with the school's actual conference.
fig, ax = plt.subplots(
    figsize=(10, 10),
    subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)),
)
ax: cartopy.mpl.geoaxes.GeoAxes
# Restrict the view to the contiguous United States
ax.set_extent(USA_BOUNDS)
# Base map layers, added in order: states, borders, ocean, lakes, land
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
ax.coastlines()

# One marker per test-set school, colored by whether the prediction was correct
for result in knn_df.itertuples(index=False):
    if result.correct:
        marker_color = "green"
    else:
        marker_color = "red"
    ax.scatter(result.longitude, result.latitude, transform=ccrs.Geodetic(), s=POINT_SIZE, color=marker_color, linewidth=0.5, edgecolor="black", zorder=2)

# Label every marker with the name of the school's actual conference
for result in knn_df.itertuples(index=False):
    ax.text(result.longitude, result.latitude, analysis.conferences[result.conference], transform=ccrs.Geodetic(), horizontalalignment="left", verticalalignment="bottom")

# Legend built from two proxy lines, one per outcome
correct_handle = plt.Line2D([0], [0], color="green", lw=4)
incorrect_handle = plt.Line2D([0], [0], color="red", lw=4)
ax.legend(handles=[correct_handle, incorrect_handle], labels=["Correct", "Incorrect"])
ax.set_title("2024 FBS Conference Members - K-Nearest Neighbors Predictions")
plt.show()
K-Means Clustering
The k-means algorithm is a cluster analysis algorithm, a mathematical process used to group data points based on similarity. The clusters it creates contain data points which are more similar to one another than to the data points outside of the cluster. Here, it is used to form geographically-based clusters (conferences) of FBS schools. These clusters represent potential conferences that maintain the geographical similarity that was originally present in college athletic conferences while using the current set of FBS schools.
In the year 2024, there are 11 conferences in the FBS. The k-means algorithm uses a k value equal to the number of conferences in the FBS that season, not counting FBS Independents. Because FBS conferences typically have 8 to 18 members, this is the range of acceptable points per cluster.
from sklearn.cluster import KMeans
# Cluster all schools by coordinates into analysis.num_conferences groups,
# retrying until every cluster size falls within the allowed range.
k_means_df = analysis.school_locations.copy()
X = k_means_df.drop(columns=["school", "conference"])
allowed_cluster_size = range(8, 19)  # 8 to 18 members, inclusive
# NOTE(review): the accompanying text says k excludes FBS Independents, but
# this uses analysis.num_conferences and every school; confirm the intent.

# The fits are not seeded, so each attempt can yield a different clustering.
# A clustering is accepted only when every cluster (hence both the smallest
# and the largest) is within allowed_cluster_size. The number of attempts is
# bounded so the cell terminates even if no such clustering exists; in that
# case the attempt with the fewest out-of-range clusters is kept.
MAX_KMEANS_ATTEMPTS = 100
k_means = None
conf_members = []
fewest_out_of_range = None
for _ in range(MAX_KMEANS_ATTEMPTS):
    # max_iter=50 caps the iterations of each individual k-means run
    candidate = KMeans(n_clusters=analysis.num_conferences, n_init='auto', max_iter=50)  # random_state=RANDOM_STATE
    # Fit the data to the model
    candidate.fit(X)
    # number of schools assigned to each cluster
    candidate_sizes = [int((candidate.labels_ == i).sum()) for i in range(analysis.num_conferences)]
    out_of_range = sum(size not in allowed_cluster_size for size in candidate_sizes)
    if fewest_out_of_range is None or out_of_range < fewest_out_of_range:
        k_means, conf_members, fewest_out_of_range = candidate, candidate_sizes, out_of_range
    if out_of_range == 0:
        break
else:
    print(f"No clustering with all cluster sizes in [{allowed_cluster_size[0]}, {allowed_cluster_size[-1]}] found after {MAX_KMEANS_ATTEMPTS} attempts; keeping the closest attempt.")

min_cluster_size = min(conf_members)
max_cluster_size = max(conf_members)
# Get the cluster labels for each data point
labels = k_means.labels_
# Get the cluster centers
centers = k_means.cluster_centers_
# Add the cluster labels to the dataframe
k_means_df["predicted_cluster"] = labels
Visualize the K-Means Clusters
Each cluster represents one potential conference created by the k-means algorithm. The circles around each cluster are centered on the centroid and intersect the data point furthest from the centroid. Clusters are color-coded according to the legend beside the map.
The clusters generated by the k-means algorithm are not an attempt to recreate any existing conference. They represent potential conferences based solely on geographic locality.
# Map of the k-means clusters: one color per cluster, plus a circle centered
# on each cluster's centroid that reaches its farthest member.
# Create a figure with an axes object on which we will plot. Pass the projection to that axes.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)))
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes, and land (in that order)
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
# Add coastlines to plot
ax.coastlines()

# Locate longitude/latitude within the cluster centers by feature name, so the
# circle position does not depend on the column order of the fitted data.
fitted_features = list(k_means.feature_names_in_)
lon_pos = fitted_features.index("longitude")
lat_pos = fitted_features.index("latitude")

for cluster_label in range(analysis.num_conferences):
    cluster_data = k_means_df[k_means_df["predicted_cluster"] == cluster_label]
    if cluster_data.empty:
        continue  # nothing to draw, and no radius can be computed
    # Add the cluster's schools to the plot with the cluster's color
    ax.scatter(cluster_data["longitude"], cluster_data["latitude"], transform=ccrs.Geodetic(), s=POINT_SIZE, label=f"Cluster {cluster_label}", color=conference_colors[cluster_label], linewidth=0.5, edgecolor="black", zorder=POINT_ZORDER)
    # Circle around the cluster: centered on the centroid (longitude, latitude),
    # radius = largest Euclidean distance, in degrees, from centroid to a member
    centroid = np.array([centers[cluster_label][lon_pos], centers[cluster_label][lat_pos]], dtype=np.float64)
    member_coords = cluster_data[["longitude", "latitude"]].to_numpy(dtype=np.float64)
    radius = np.linalg.norm(member_coords - centroid, axis=1).max()
    ax.add_patch(plt.Circle((centroid[0], centroid[1]), radius, edgecolor=conference_colors[cluster_label], fill=False, linewidth=1.5, transform=ccrs.Geodetic(), zorder=LINE_ZORDER))

# Add a title to the plot
ax.set_title("2024 FBS Conference Suggestions - K-Means Clusters")
# The legend is built from the "Cluster N" scatter labels; reusing the handles
# of the real conferences would attach existing conference names to clusters.
ax.legend(title="Conference", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()
Conclusions
With this analysis, it becomes apparent that the athletic conferences of the FBS member schools are no longer based primarily on geography. In fact, conferences have very little relation to the location of their member schools. Including 2024 realignments, the conferences have moved even further away from their geographic roots.