import os
import pickle
import numpy as np
import pandas as pd

import pygskin

# Constants for saving and loading data.
# `year` selects the season; cached files are stored under <cwd>/data.
year = 2023
# Pass the components separately so os.path.join inserts the separator itself.
path_to_files = os.path.join(os.getcwd(), "data")

# Calculation constants.
RANDOM_STATE = 42

# Plot constants and variables.
# Extent handed to GeoAxes.set_extent: [west, east, south, north] in degrees.
USA_BOUNDS = [-121, -75, 23, 50]
POINT_SIZE = 20
POINT_ZORDER = 2
LINE_ZORDER = 1.5

# Named matplotlib colors; the plotting cells index this list by conference
# (or cluster) number, so the order matters.
conference_colors = [
    "red", "blue", "green", "orange", "purple",
    "brown", "pink", "lime", "cyan", "magenta",
    "yellow", "gray", "olive", "maroon", "navy",
    "teal", "gold", "darkorange", "darkgreen", "darkred",
    "darkblue", "darkgray", "darkcyan", "darkmagenta", "darkkhaki",
    "darkgoldenrod", "darkslategray", "darkolivegreen", "darkseagreen", "darkslateblue",
    "darkturquoise", "darkviolet", "deeppink", "deepskyblue", "dimgray",
    "dodgerblue", "firebrick", "forestgreen", "fuchsia", "gainsboro",
    "ghostwhite", "goldenrod", "greenyellow", "hotpink", "indianred",
]

# Load the season from the local pickle cache when present; otherwise fetch it
# from the API and write the cache for next time.
# Cache layout: <path_to_files>/<year>/season_<year>.cfb plus one team_<school>.cfb per team.
season_dir = os.path.join(path_to_files, str(year))
season_file = os.path.join(season_dir, f"season_{year}.cfb")

if os.path.exists(season_file):
    print(f"Loading {year} season from file...")
    # NOTE(review): pickle.load can execute arbitrary code stored in the file.
    # Only load cache files that this notebook wrote itself.
    with open(season_file, "rb") as f:
        season = pickle.load(f)
    print(f"Loaded {year} season from file.")
else:
    print(f"Loading {year} season from API...")
    season = pygskin.Season.from_cfbd_api(year)
    print(f"Loaded {year} season from API.")
    # Create the season directory, including a missing parent "data" directory.
    # os.mkdir would raise FileNotFoundError on a fresh checkout without it.
    os.makedirs(season_dir, exist_ok=True)
    # Pickle the season.
    with open(season_file, "wb") as f:
        pickle.dump(season, f)
    # Pickle every Team in the Season into its own file.
    for school in season.teams.keys():
        team_file = os.path.join(season_dir, f"team_{school}.cfb")
        with open(team_file, "wb") as f:
            pickle.dump(season.teams[school], f)
    print(f"Saved {year} season to file.")

analysis = pygskin.SeasonAnalyzer(season)

Loading 2023 season from file...
Loaded 2023 season from file.

import cartopy.mpl.geoaxes
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt

# Draw every school on a map of the contiguous United States, colored by conference,
# then render the conference legend as a separate small figure.

# Create a figure with an axes object on which we will plot. Pass the projection to that axes.
fig, ax = plt.subplots(
    figsize=(10, 10),
    subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)),
)
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes and land (same order as before)
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
# Add coastlines to plot
ax.coastlines()

school_locations = analysis.school_locations
conference_ids = school_locations["conference"].unique()

for conference in conference_ids:
    # Filter once per conference instead of once per plotted column.
    conf_schools = school_locations[school_locations["conference"] == conference]

    # Add the locations of the schools to the plot with a different color for each conference
    ax.scatter(
        conf_schools["longitude"],
        conf_schools["latitude"],
        color=conference_colors[conference],
        s=POINT_SIZE,
        transform=ccrs.Geodetic(),
        linewidth=0.5,
        edgecolor="black",
        zorder=POINT_ZORDER,
    )

    # Centroid of the conference and the largest centroid-to-school distance (in degrees).
    # These are only computed: drawing the enclosing circle is currently disabled below.
    coordinates = conf_schools[["longitude", "latitude"]].to_numpy(dtype=np.float64)
    centroid = np.average(coordinates, axis=0)
    max_distance = np.linalg.norm(coordinates - centroid, axis=1).max()
    # ax.add_patch(plt.Circle((centroid[0], centroid[1]), max_distance, transform=ccrs.Geodetic(), fill=False, color=conference_colors[conference]))

# Add a title to the plot
ax.set_title(f"{analysis.season.year} FBS Schools")

# One legend entry per conference: a colored marker labelled with the conference name
legend_elements = [
    plt.Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        label=analysis.conferences[conference],
        markerfacecolor=conference_colors[conference],
        markersize=10,
    )
    for conference in conference_ids
]

plt.show()

# Show the legend as its own figure so it does not cover the map
fig_legend = plt.figure(figsize=(2, 2))
axi = fig_legend.add_subplot(111)
fig_legend.legend(handles=legend_elements, loc="center", title="Conference")
axi.xaxis.set_visible(False)
axi.yaxis.set_visible(False)
# remove bounding box
for side in ("top", "bottom", "left", "right"):
    axi.spines[side].set_visible(False)
axi.set_frame_on(False)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

print("Training KNN model...")

# Preprocess the data. Drop school and conference columns, use conference as target
df = analysis.school_locations.copy()
# Remove schools in "FBS Independents": the "conference" column stores positions
# in analysis.conferences, so compare against that position.
independents = analysis.conferences.index("FBS Independents")
df = df[df["conference"] != independents]

# X holds the coordinate columns (longitude and latitude); y holds the conference.
X = df.drop(columns=["school", "conference"])
y = df["conference"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# train the model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# test the model
y_pred = knn.predict(X_test)
# print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Build the results table from the test rows themselves, selected by index.
# Matching rows by longitude/latitude values can pick up schools outside the test set.
# y_pred is a plain array in X_test's (shuffled) row order, so it must be attached
# through X_test.index; assigning it positionally pairs predictions with the wrong schools.
knn_df = df.loc[X_test.index, ["school", "longitude", "latitude", "conference"]].copy()
knn_df["predicted"] = pd.Series(y_pred, index=X_test.index)
knn_df["correct"] = knn_df["conference"] == knn_df["predicted"]
# Keep the rows in the original school order for display.
knn_df = knn_df.sort_index()

# Human-readable copy: no coordinates, conference positions replaced by names.
printable_knn_df = knn_df.drop(columns=["longitude", "latitude"])
printable_knn_df["conference"] = [analysis.conferences[conference] for conference in printable_knn_df["conference"]]
printable_knn_df["predicted"] = [analysis.conferences[conference] for conference in printable_knn_df["predicted"]]

print(printable_knn_df)

Training KNN model...
Accuracy: 0.23076923076923078
               school         conference          predicted  correct
4             Arizona             Pac-12  American Athletic    False
11             Baylor             Big 12            Big Ten    False
12        Boise State      Mountain West       Mid-American    False
19          Charlotte  American Athletic             Pac-12    False
20         Cincinnati             Big 12      Mountain West    False
28   Eastern Michigan       Mid-American                SEC    False
29            Florida                SEC     Conference USA    False
33       Fresno State      Mountain West            Big Ten    False
38            Hawai'i      Mountain West             Pac-12    False
42               Iowa            Big Ten       Mid-American    False
47       Kansas State             Big 12                ACC    False
57           Maryland            Big Ten           Sun Belt    False
58            Memphis  American Athletic             Big 12    False
66           Missouri                SEC     Conference USA    False
71         New Mexico      Mountain West  American Athletic    False
83           Ole Miss                SEC                ACC    False
84             Oregon             Pac-12             Pac-12     True
87         Pittsburgh                ACC                SEC    False
96     South Carolina                SEC           Sun Belt    False
98      South Florida  American Athletic       Mid-American    False
100          Syracuse                ACC                ACC     True
103         Tennessee                SEC  American Athletic    False
108            Toledo       Mid-American                ACC    False
118              Utah             Pac-12             Pac-12     True
123          Virginia                ACC                SEC    False
128  Western Kentucky     Conference USA             Big 12    False

# Map the KNN test schools: green when the predicted conference matches the actual one,
# red otherwise, each annotated with its actual conference name.

# Create a figure with an axes object on which we will plot. Pass the projection to that axes.
fig, ax = plt.subplots(
    figsize=(10, 10),
    subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)),
)
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes and land (same order as before)
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
# Add coastlines to plot
ax.coastlines()

# Plot each school with a color corresponding to the success of the prediction.
# One scatter call with a per-point color list replaces a call per row.
point_colors = ["green" if is_correct else "red" for is_correct in knn_df["correct"]]
ax.scatter(
    knn_df["longitude"],
    knn_df["latitude"],
    transform=ccrs.Geodetic(),
    s=POINT_SIZE,
    color=point_colors,
    linewidth=0.5,
    edgecolor="black",
    zorder=POINT_ZORDER,
)

# Label every plotted school with the name of its actual conference
for school in knn_df.itertuples():
    ax.text(
        school.longitude,
        school.latitude,
        analysis.conferences[school.conference],
        transform=ccrs.Geodetic(),
        horizontalalignment="left",
        verticalalignment="bottom",
    )

# Add a legend to the plot
ax.legend(
    handles=[plt.Line2D([0], [0], color="green", lw=4), plt.Line2D([0], [0], color="red", lw=4)],
    labels=["Correct", "Incorrect"],
)

# Add a title to the plot
ax.set_title(f"{analysis.season.year} FBS Conference Members - K-Nearest Neighbors Predictions")

plt.show()

from sklearn.cluster import KMeans

# Cluster the schools by location into as many groups as there are conferences,
# retrying until every cluster has a conference-like number of members.
k_means_df = analysis.school_locations.copy()
X = k_means_df.drop(columns=["school", "conference"])

# Acceptable number of schools per cluster (8 to 18 inclusive).
allowed_cluster_size = range(8, 19)
# Upper bound on refits: a clustering with every size in range may not exist,
# so an unbounded retry loop could run forever.
MAX_KMEANS_ATTEMPTS = 200

min_cluster_size, max_cluster_size = 0, 0
k_means = None
best_spread = None
for attempt in range(MAX_KMEANS_ATTEMPTS):
    # A different seed per attempt gives a different initialisation each time
    # while keeping the whole search reproducible. Each fit runs at most 50 iterations.
    candidate = KMeans(
        n_clusters=analysis.num_conferences,
        n_init="auto",
        max_iter=50,
        random_state=RANDOM_STATE + attempt,
    )
    # Fit the data to the model
    candidate.fit(X)
    # Number of schools assigned to each cluster
    conf_members = np.bincount(candidate.labels_, minlength=analysis.num_conferences).tolist()
    smallest, largest = min(conf_members), max(conf_members)
    # Accept only when BOTH the smallest and the largest cluster are in range;
    # stopping when just one of them is in range leaves badly sized clusters.
    sizes_ok = smallest in allowed_cluster_size and largest in allowed_cluster_size
    spread = largest - smallest
    if sizes_ok or best_spread is None or spread < best_spread:
        # Remember the accepted clustering, or the most balanced one seen so far.
        k_means = candidate
        best_spread = spread
        min_cluster_size, max_cluster_size = smallest, largest
    if sizes_ok:
        break
else:
    print(
        f"No clustering with all sizes in [{allowed_cluster_size[0]}, {allowed_cluster_size[-1]}] "
        f"found after {MAX_KMEANS_ATTEMPTS} attempts; using the most balanced one "
        f"(min {min_cluster_size}, max {max_cluster_size})."
    )

# Get the cluster labels for each data point
labels = k_means.labels_

# Get the cluster centers
centers = k_means.cluster_centers_

# Add the cluster labels to the dataframe
k_means_df["predicted_cluster"] = labels

# Map the K-Means clusters: schools colored by cluster, a circle around each cluster,
# and a separate legend figure naming the clusters.

# Create a figure with an axes object on which we will plot. Pass the projection to that axes.
fig, ax = plt.subplots(
    figsize=(10, 10),
    subplot_kw=dict(projection=ccrs.LambertConformal(central_longitude=-100, central_latitude=45)),
)
ax: cartopy.mpl.geoaxes.GeoAxes
# Set the extent of the map to the contiguous United States
ax.set_extent(USA_BOUNDS)
# Add state boundaries, country borders, ocean, lakes and land (same order as before)
for map_feature in (cfeature.STATES, cfeature.BORDERS, cfeature.OCEAN, cfeature.LAKES, cfeature.LAND):
    ax.add_feature(map_feature)
# Add coastlines to plot
ax.coastlines()

# Cluster centers follow the column order of the frame the model was fitted on (X).
# Look the coordinates up by name instead of assuming which column comes first.
feature_columns = list(X.columns)
lon_position = feature_columns.index("longitude")
lat_position = feature_columns.index("latitude")

for cluster_label in range(analysis.num_conferences):
    cluster_data = k_means_df[k_means_df["predicted_cluster"] == cluster_label]

    # Add the schools of this cluster to the plot with the cluster's color
    ax.scatter(
        cluster_data["longitude"],
        cluster_data["latitude"],
        transform=ccrs.Geodetic(),
        s=POINT_SIZE,
        label=f"Cluster {cluster_label}",
        color=conference_colors[cluster_label],
        linewidth=0.5,
        edgecolor="black",
        zorder=POINT_ZORDER,
    )

    # Circle centered on the cluster center, reaching the farthest member.
    # The radius is a plain Euclidean distance in degrees.
    center_lon = centers[cluster_label][lon_position]
    center_lat = centers[cluster_label][lat_position]
    member_coordinates = cluster_data[["longitude", "latitude"]].to_numpy(dtype=np.float64)
    offsets = member_coordinates - np.array([center_lon, center_lat], dtype=np.float64)
    radius = np.linalg.norm(offsets, axis=1).max()
    ax.add_patch(
        plt.Circle(
            (center_lon, center_lat),
            radius,
            edgecolor=conference_colors[cluster_label],
            fill=False,
            linewidth=1.5,
            transform=ccrs.Geodetic(),
            zorder=LINE_ZORDER,
        )
    )

# Add a title to the plot
ax.set_title(f"{analysis.season.year} FBS Conference Suggestions - K-Means Clusters")

plt.show()

# Show the legend as its own figure. Use the "Cluster N" entries registered by the
# scatter calls above: clusters are not real conferences, so conference names do not apply.
legend_handles, legend_labels = ax.get_legend_handles_labels()
fig_legend = plt.figure(figsize=(2, 2))
axi = fig_legend.add_subplot(111)
fig_legend.legend(handles=legend_handles, labels=legend_labels, loc="center", title="Cluster")

# Hide the helper axes so only the legend is visible
axi.xaxis.set_visible(False)
axi.yaxis.set_visible(False)
# remove bounding box
for side in ("top", "bottom", "left", "right"):
    axi.spines[side].set_visible(False)
axi.set_frame_on(False)

Set Up

Imports, Global Variables, and Constants

Load Season Data

Data Visualization

K-Nearest Neighbors Model

Actual vs. Predicted Clusters

K-Means Clustering

Visualize the K-Means Clusters