
Introduction
Regardless of who they root for, most NBA fans can agree that officiating is far from perfect. A seemingly winnable game can suddenly slip out of reach after one or two bad calls. Early playoff exits and heartbreaking losses are often blamed on biased referees and what sour fanbases like to call "the script"; just look at any social media platform after a close game.
With all of that said, do NBA referees really deserve the criticism they receive? Are things actually rigged for certain teams? The main focus of this project is to answer some of these questions with data from previous seasons. This tutorial will walk you through the entire data science pipeline, from data collection to machine learning, and by the end you should have a better understanding of the factors that affect the accuracy of NBA referees. If we can use a model to predict incorrect calls based on the primary official of a game, we may even be able to hold specific refs accountable or identify who is fit for the job.
The tutorial will be separated into five main sections:
- Data Collection
- Data Processing
- Exploratory Analysis and Data Visualization
- Model Analysis, Hypothesis Testing, Machine Learning
- Insights
Data Collection
Before we can start exploring our topic, we need to find a dataset to work with. Since we want information related to foul calls, it makes the most sense to check the official NBA website for public data first. Luckily, the NBA has been publishing Last Two Minute Reports for every game since the 2015-2016 season. An archive of these reports can be found here. As you may have guessed, Last Two Minute Reports assess the correctness of foul calls made within the last two minutes of a game and provide additional details such as the time left on the clock, the player who committed the foul, the player who was fouled, the type of foul called (shooting, personal, loose ball, technical, etc.), and the review decision. There are four kinds of review decisions: CC = correct call, IC = incorrect call, CNC = correct non-call, and INC = incorrect non-call (a non-call is a play where the referees did not blow the whistle). For our purposes, an incorrect call is anything classified as an IC or INC. Some fans complain that the reports are useless because they do nothing to change game outcomes, but for data scientists, they are great! Unfortunately, the NBA does not provide the reports in .csv or spreadsheet format, so we will have to deal with that somehow.

Now, we could build a web scraper and extract all of this data ourselves, or we could look around for existing datasets. With a bit of digging, we were able to find this handy website, which has already scraped foul calls from every Last Two Minute Report from March 2015 to May 2024. Go ahead and download the .csv file available on the GitHub page. At this point, we can import the necessary Python libraries and start coding!
Imports
# Import the necessary libraries
# NOTE: you may need to install the libraries first via "!pip install"
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import statsmodels.formula.api as sm
import os
import datetime
from unicodedata import normalize
from io import StringIO
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score
If you are wondering what these libraries are for, here is a brief overview of the most important ones:
- Pandas is a popular open-source tool for organizing and working with data in Python. The main data structure in Pandas is the dataframe, which you can think of as a simple table of rows and columns (see the short example below). Because they are easy to manipulate and understand, dataframes will be our structure of choice for the rest of the tutorial.
- NumPy is an extensive math library that we can use to perform fast operations on arrays. SciPy is similar to NumPy with a greater emphasis on scientific computing.
- Matplotlib is a plotting library that we will use to create static visualizations like line graphs and histograms. Seaborn builds on Matplotlib and provides a higher-level interface for statistical plots.
- The statsmodels.formula.api and scikit-learn libraries are useful for running and evaluating statistical models and machine learning algorithms.
Also, if you are not familiar with Jupyter Notebook, you can read about the platform here.
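To get a feel for the dataframe structure before loading the real data, here is a tiny sketch with made-up values (two rows, two columns):
# A tiny dataframe with made-up values, just to show the rows-and-columns structure
example_df = pd.DataFrame({
    "team": ["Knicks", "Lakers"],
    "fouls_called": [3, 5]
})
print(example_df)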
Get Foul Data from 2015-2024
# Load the csv into a pandas dataframe
foul_df = pd.read_csv("foul_data_2015_2024.csv", low_memory=False)
foul_df.head()
period | time | call_type | committing | disadvantaged | decision | comments | game_details | page | file | ... | committing_min | committing_team | committing_side | disadvantaged_min | disadvantaged_team | disadvantaged_side | type2 | time_min | time_sec | time2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Q4 | 01:52.0 | Foul: Shooting | Josh Smith | Kevin Love | CNC | Smith (HOU) does not make contact with Love (C... | Cavaliers @ Rockets (Mar 01, 2015) | 1.0 | L2M-CLE-HOU-3-1-15.pdf | ... | 30.950000 | HOU | home | 35.016667 | CLE | away | SHOOTING | 1 | 52.0 | 1.866667 |
1 | Q4 | 01:43.0 | Foul: Shooting | JR Smith | James Harden | CC | Smith (CLE) makes contact with the body of Har... | Cavaliers @ Rockets (Mar 01, 2015) | 1.0 | L2M-CLE-HOU-3-1-15.pdf | ... | 50.800000 | CLE | away | 42.350000 | HOU | home | SHOOTING | 1 | 43.0 | 1.716667 |
2 | Q4 | 01:32.0 | Foul: Shooting | Trevor Ariza | LeBron James | CC | Ariza (HOU) makes contact with the shoulder of... | Cavaliers @ Rockets (Mar 01, 2015) | 1.0 | L2M-CLE-HOU-3-1-15.pdf | ... | 42.233333 | HOU | home | 42.100000 | CLE | away | SHOOTING | 1 | 32.0 | 1.533333 |
3 | Q4 | 01:09.0 | Foul: Loose Ball | Terrence Jones | Tristan Thompson | CC | Jones (HOU) makes contact with the arm of Thom... | Cavaliers @ Rockets (Mar 01, 2015) | 1.0 | L2M-CLE-HOU-3-1-15.pdf | ... | 44.016667 | HOU | home | 37.316667 | CLE | away | LOOSE BALL | 1 | 9.0 | 1.150000 |
4 | Q4 | 00:53.0 | Foul: Shooting | Tristan Thompson | Josh Smith | CNC | Smith (HOU) loses the ball as he goes up for t... | Cavaliers @ Rockets (Mar 01, 2015) | 1.0 | L2M-CLE-HOU-3-1-15.pdf | ... | 37.316667 | CLE | away | 30.950000 | HOU | home | SHOOTING | 0 | 53.0 | 0.883333 |
5 rows × 64 columns
After using the Pandas read_csv() function to create a dataframe from the foul dataset (in my case the file is named foul_data_2015_2024.csv, but you can name it whatever you like), we can take a quick look at the data with the head() function. We will get into what each column represents in the next section. Every row represents a single foul call.
Data Processing
Cleaning the Foul Data
# Create a new dataframe with selected columns
cols = [
"date", "time", "season", "home_team", "away_team",
"type", "committing", "disadvantaged", "decision",
"committing_team", "disadvantaged_team", "OFFICIAL_1"
]
foul_df_clean = foul_df[cols].copy()
# Rename columns
foul_df_clean = foul_df_clean.rename(columns={
"home_team": "home",
"away_team": "away",
"committing": "player",
"disadvantaged": "opponent",
"committing_team": "player team",
"disadvantaged_team": "opponent team",
"OFFICIAL_1": "official"
})
# Map team abbreviations to names
all_teams = {
"ATL": "Hawks", "BOS": "Celtics", "BKN": "Nets", "CHA": "Hornets", "CHI": "Bulls",
"CLE": "Cavaliers", "DAL": "Mavericks", "DEN": "Nuggets", "DET": "Pistons", "GSW": "Warriors",
"HOU": "Rockets", "IND": "Pacers", "LAC": "Clippers", "LAL": "Lakers", "MEM": "Grizzlies",
"MIA": "Heat", "MIL": "Bucks", "MIN": "Timberwolves", "NOP": "Pelicans", "NYK": "Knicks",
"OKC": "Thunder", "ORL": "Magic", "PHI": "76ers", "PHX": "Suns", "POR": "Trail Blazers",
"SAC": "Kings", "SAS": "Spurs", "TOR": "Raptors", "TOT": "Total", "UTA": "Jazz", "WAS": "Wizards"
}
# Map team abbreviations to their names for the player and opponent team columns
foul_df_clean["player team"] = foul_df_clean["player team"].map(all_teams)
foul_df_clean["opponent team"] = foul_df_clean["opponent team"].map(all_teams)
foul_df_clean.head()
date | time | season | home | away | type | player | opponent | decision | player team | opponent team | official | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-03-01 | 01:52.0 | 2015 | Rockets | Cavaliers | SHOOTING | Josh Smith | Kevin Love | CNC | Rockets | Cavaliers | Dan Crawford |
1 | 2015-03-01 | 01:43.0 | 2015 | Rockets | Cavaliers | SHOOTING | JR Smith | James Harden | CC | Cavaliers | Rockets | Dan Crawford |
2 | 2015-03-01 | 01:32.0 | 2015 | Rockets | Cavaliers | SHOOTING | Trevor Ariza | LeBron James | CC | Rockets | Cavaliers | Dan Crawford |
3 | 2015-03-01 | 01:09.0 | 2015 | Rockets | Cavaliers | LOOSE BALL | Terrence Jones | Tristan Thompson | CC | Rockets | Cavaliers | Dan Crawford |
4 | 2015-03-01 | 00:53.0 | 2015 | Rockets | Cavaliers | SHOOTING | Tristan Thompson | Josh Smith | CNC | Cavaliers | Rockets | Dan Crawford |
While it is nice that the dataset has many different features to choose from, we only need a select few for analysis and visualization. Of the 64 available columns, we will use the following:
- Date = the date of the game.
- Time = the amount of time left on the clock when the foul was called, in MINUTES:SECONDS.MILLISECONDS format.
- Season = the year.
- Home team = the name of the home team.
- Away team = the name of the away team.
- Type = the type of foul called.
- Committing = the name of the player who committed the foul.
- Disadvantaged = the name of the player who was fouled.
- Decision = the review decision for the foul, CC, CNC, IC, or INC.
- Committing team = the team of the player who committed the foul.
- Disadvantaged team = the team of the player who got fouled.
- Official 1 = the primary referee of the game in which the foul was called.
In addition to filtering down to these columns, we will rename some of them so they are a bit easier to work with and understand. Lastly, we will use the map() function to change the player and opponent team columns so they match the home and away team columns. If you look at the player/opponent team and home/away team columns before the modification, you will notice that their values do not follow the same format: the player/opponent teams use abbreviations instead of full names ("NYK" instead of "Knicks", for example).
If the review decision is missing (NaN) for a row, we will drop that row altogether. After cleaning up the dataframe, displaying foul_df_clean should look like the block above.
Remove NaN Observations and Add a New Column to Assess Correct and Incorrect Calls
# Remove rows where the decision column is NaN
foul_df_clean = foul_df_clean[~foul_df_clean["decision"].isna()].reset_index(drop=True)
def correct_decision(decision):
    if decision in ["CC", "CNC"]:
        return 1
    else:
        return 0
# Create a column that serves as a binary indicator for whether the referees acted correctly or incorrectly
foul_df_clean["correct decision"] = foul_df_clean["decision"].apply(correct_decision)
foul_df_clean.head()
date | time | season | home | away | type | player | opponent | decision | player team | opponent team | official | correct decision | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-03-01 | 01:52.0 | 2015 | Rockets | Cavaliers | SHOOTING | Josh Smith | Kevin Love | CNC | Rockets | Cavaliers | Dan Crawford | 1 |
1 | 2015-03-01 | 01:43.0 | 2015 | Rockets | Cavaliers | SHOOTING | JR Smith | James Harden | CC | Cavaliers | Rockets | Dan Crawford | 1 |
2 | 2015-03-01 | 01:32.0 | 2015 | Rockets | Cavaliers | SHOOTING | Trevor Ariza | LeBron James | CC | Rockets | Cavaliers | Dan Crawford | 1 |
3 | 2015-03-01 | 01:09.0 | 2015 | Rockets | Cavaliers | LOOSE BALL | Terrence Jones | Tristan Thompson | CC | Rockets | Cavaliers | Dan Crawford | 1 |
4 | 2015-03-01 | 00:53.0 | 2015 | Rockets | Cavaliers | SHOOTING | Tristan Thompson | Josh Smith | CNC | Cavaliers | Rockets | Dan Crawford | 1 |
We previously mentioned that incorrect calls include both IC and INC decisions. To make things easier later on, we will create another column, "correct decision", to numerically indicate the correctness of each foul call. Since a foul call can only be wrong or right, we will use ones for correct calls (when decision == CNC or CC) and zeros for incorrect calls (when decision == INC or IC).
In the next code block we will also use the time column to create a new column containing the seconds remaining in a game.
Create a New Column for Seconds Remaining
# Splitting time into minutes and seconds
foul_df_clean[["min", "sec"]] = foul_df_clean["time"].str.split(":", expand=True)
foul_df_clean["min"] = foul_df_clean["min"].astype(int)
foul_df_clean["sec"] = foul_df_clean["sec"].astype(float)
# Calculate total time in seconds
foul_df_clean["seconds"] = foul_df_clean["min"] * 60 + foul_df_clean["sec"]
foul_df_clean.head()
date | time | season | home | away | type | player | opponent | decision | player team | opponent team | official | correct decision | min | sec | seconds | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-03-01 | 01:52.0 | 2015 | Rockets | Cavaliers | SHOOTING | Josh Smith | Kevin Love | CNC | Rockets | Cavaliers | Dan Crawford | 1 | 1 | 52.0 | 112.0 |
1 | 2015-03-01 | 01:43.0 | 2015 | Rockets | Cavaliers | SHOOTING | JR Smith | James Harden | CC | Cavaliers | Rockets | Dan Crawford | 1 | 1 | 43.0 | 103.0 |
2 | 2015-03-01 | 01:32.0 | 2015 | Rockets | Cavaliers | SHOOTING | Trevor Ariza | LeBron James | CC | Rockets | Cavaliers | Dan Crawford | 1 | 1 | 32.0 | 92.0 |
3 | 2015-03-01 | 01:09.0 | 2015 | Rockets | Cavaliers | LOOSE BALL | Terrence Jones | Tristan Thompson | CC | Rockets | Cavaliers | Dan Crawford | 1 | 1 | 9.0 | 69.0 |
4 | 2015-03-01 | 00:53.0 | 2015 | Rockets | Cavaliers | SHOOTING | Tristan Thompson | Josh Smith | CNC | Cavaliers | Rockets | Dan Crawford | 1 | 0 | 53.0 | 53.0 |
Exploratory Analysis and Data Visualization
In this section, we are going to create a few different plots with our cleaned dataset and try to find some possible relationships between the variables. Feel free to recreate this part and try different graphs; the point of this stage is to experiment a bit!
For the first graph, let's see how referees have performed over time by plotting the average foul call accuracy for each season. After grouping rows together by season using groupby(), the mean() function will allow us to quickly calculate the average value of the "correct decision" column for each group. This is basically the same as finding: $$\frac{\textrm{\# of correct decisions}}{\textrm{total decisions}}$$
After plotting with Matplotlib, you will notice that referees have *generally* gotten better over time. We will keep this in mind moving forward.
Plot Accuracy of Foul Calls Over Time
# Group by season and calculate the mean accuracy of decisions for each season
season_accuracy = foul_df_clean.groupby("season")["correct decision"].mean()
# Plotting
plt.figure(figsize=(10, 6))
season_accuracy.plot(marker="o", linestyle="-")
plt.title("Accuracy of Foul Calls Over Seasons")
plt.xlabel("Season")
plt.ylabel("Mean Call Accuracy")
plt.xticks(season_accuracy.index)
plt.grid(True)
plt.tight_layout()
plt.show()
To get a better idea of how biased (or unbiased) referees are, let's count the number of incorrect calls against each team. A foul call puts a team at a disadvantage when the decision is incorrect and the player charged with the foul is on that team. In the following code block, we will use nested Python dictionaries to map team names to the number of incorrect calls against them for every season.
Count the Number of Incorrect Calls Against Each Team for Each Season
# Get the number of unique team names
teams = foul_df_clean["home"].unique()
# Dictionary to store the results
team_season_incorrect_counts = {}
for team in teams:
    # Initialize a dictionary for the current team
    team_season_incorrect_counts[team] = {}
    for season in foul_df_clean["season"].unique():
        # Filter data for the current team and season
        team_season_data = foul_df_clean[(foul_df_clean["player team"] == team) & (foul_df_clean["season"] == season)]
        # Count observations where correct decision is 0 (incorrect)
        count_incorrect = len(team_season_data[(team_season_data["correct decision"] == 0)])
        # Store the count for the current season and move onto the next season
        team_season_incorrect_counts[team][season] = count_incorrect
# Preview the counts for the first four teams
counter = 0
for team, season_counts in team_season_incorrect_counts.items():
    counter += 1
    if counter == 5:
        break
    print(f"Team: {team}")
    for season, count in season_counts.items():
        print(f"{season} Season: {count} incorrect calls")
Team: Rockets
2015 Season: 4 incorrect calls
2016 Season: 45 incorrect calls
2017 Season: 26 incorrect calls
2018 Season: 12 incorrect calls
2019 Season: 43 incorrect calls
2020 Season: 10 incorrect calls
2021 Season: 7 incorrect calls
2022 Season: 10 incorrect calls
2023 Season: 14 incorrect calls
2024 Season: 25 incorrect calls
Team: Celtics
2015 Season: 7 incorrect calls
2016 Season: 13 incorrect calls
2017 Season: 28 incorrect calls
2018 Season: 18 incorrect calls
2019 Season: 24 incorrect calls
2020 Season: 16 incorrect calls
2021 Season: 21 incorrect calls
2022 Season: 30 incorrect calls
2023 Season: 31 incorrect calls
2024 Season: 9 incorrect calls
Team: Nuggets
2015 Season: 10 incorrect calls
2016 Season: 24 incorrect calls
2017 Season: 14 incorrect calls
2018 Season: 22 incorrect calls
2019 Season: 29 incorrect calls
2020 Season: 26 incorrect calls
2021 Season: 15 incorrect calls
2022 Season: 19 incorrect calls
2023 Season: 20 incorrect calls
2024 Season: 18 incorrect calls
Team: Nets
2015 Season: 14 incorrect calls
2016 Season: 19 incorrect calls
2017 Season: 17 incorrect calls
2018 Season: 18 incorrect calls
2019 Season: 23 incorrect calls
2020 Season: 23 incorrect calls
2021 Season: 17 incorrect calls
2022 Season: 27 incorrect calls
2023 Season: 16 incorrect calls
2024 Season: 11 incorrect calls
We might also want to know how many favorable calls each team gets every season. This is similar to what we did in the previous code block, with some minor tweaks. In this case, a team is considered favored when one of its players draws an incorrect foul call against the other team (and gains possession of the ball or gets to shoot free throws as a result). When it comes to implementing this, we will increment a team's favorable call count when the decision is incorrect and the player who got fouled, AKA the opponent, is on the current team.
Count the Number of Incorrect Calls That Favor Each Team for Each Season
# Dictionary to store the results
team_season_favorable_counts = {}
for team in teams:
    team_season_favorable_counts[team] = {}
    for season in foul_df_clean["season"].unique():
        # Simple change: player team -> opponent team
        # We want to find observations where players on the current team were
        # given incorrect foul calls (which put the other team at a disadvantage)
        team_season_data = foul_df_clean[(foul_df_clean["opponent team"] == team) & (foul_df_clean["season"] == season)]
        # Count observations where correct decision is 0 (incorrect)
        count_favorable = len(team_season_data[(team_season_data["correct decision"] == 0)])
        # Store the count for the current season and move onto the next season
        team_season_favorable_counts[team][season] = count_favorable
# Preview the counts for the first four teams
counter = 0
for team, season_counts in team_season_favorable_counts.items():
    counter += 1
    if counter == 5:
        break
    print(f"Team: {team}")
    for season, count in season_counts.items():
        print(f"{season} Season: {count} favorable incorrect calls")
Team: Rockets
2015 Season: 6 favorable incorrect calls
2016 Season: 22 favorable incorrect calls
2017 Season: 16 favorable incorrect calls
2018 Season: 14 favorable incorrect calls
2019 Season: 25 favorable incorrect calls
2020 Season: 21 favorable incorrect calls
2021 Season: 6 favorable incorrect calls
2022 Season: 15 favorable incorrect calls
2023 Season: 14 favorable incorrect calls
2024 Season: 28 favorable incorrect calls
Team: Celtics
2015 Season: 5 favorable incorrect calls
2016 Season: 22 favorable incorrect calls
2017 Season: 12 favorable incorrect calls
2018 Season: 17 favorable incorrect calls
2019 Season: 20 favorable incorrect calls
2020 Season: 21 favorable incorrect calls
2021 Season: 12 favorable incorrect calls
2022 Season: 23 favorable incorrect calls
2023 Season: 31 favorable incorrect calls
2024 Season: 15 favorable incorrect calls
Team: Nuggets
2015 Season: 8 favorable incorrect calls
2016 Season: 18 favorable incorrect calls
2017 Season: 10 favorable incorrect calls
2018 Season: 27 favorable incorrect calls
2019 Season: 30 favorable incorrect calls
2020 Season: 29 favorable incorrect calls
2021 Season: 30 favorable incorrect calls
2022 Season: 36 favorable incorrect calls
2023 Season: 39 favorable incorrect calls
2024 Season: 11 favorable incorrect calls
Team: Nets
2015 Season: 6 favorable incorrect calls
2016 Season: 12 favorable incorrect calls
2017 Season: 14 favorable incorrect calls
2018 Season: 30 favorable incorrect calls
2019 Season: 29 favorable incorrect calls
2020 Season: 24 favorable incorrect calls
2021 Season: 12 favorable incorrect calls
2022 Season: 20 favorable incorrect calls
2023 Season: 25 favorable incorrect calls
2024 Season: 11 favorable incorrect calls
We may also expect the home and away teams to be treated differently every game, so let's go ahead and repeat what we did in the previous two blocks but for the home and away columns.
Alright, this might seem like a lot, but we are almost at the finish line!
Repeat the Previous Two Steps for Home and Away Teams
# Dictionary to store the results
home_away_counts = {"Home":{},"Away":{}}
for season in foul_df_clean["season"].unique():
    home_season_inc = foul_df_clean[(foul_df_clean["player team"] == foul_df_clean["home"]) & (foul_df_clean["season"] == season)]
    away_season_inc = foul_df_clean[(foul_df_clean["player team"] == foul_df_clean["away"]) & (foul_df_clean["season"] == season)]
    home_season_fav = foul_df_clean[(foul_df_clean["opponent team"] == foul_df_clean["home"]) & (foul_df_clean["season"] == season)]
    away_season_fav = foul_df_clean[(foul_df_clean["opponent team"] == foul_df_clean["away"]) & (foul_df_clean["season"] == season)]
    # Count observations where correct decision is 0 (incorrect) for the home team
    count_incorrect_home = len(home_season_inc[(home_season_inc["correct decision"] == 0)])
    # Do the same for the away team
    count_incorrect_away = len(away_season_inc[(away_season_inc["correct decision"] == 0)])
    # Count observations where correct decision is 0 (incorrect) but favors the home team
    count_favorable_home = len(home_season_fav[(home_season_fav["correct decision"] == 0)])
    # Do the same for the away team
    count_favorable_away = len(away_season_fav[(away_season_fav["correct decision"] == 0)])
    # Store the counts for the current season and move onto the next season
    # The counts are represented as a tuple (# incorrect, # favorable incorrect)
    home_away_counts["Home"][season] = (count_incorrect_home, count_favorable_home)
    home_away_counts["Away"][season] = (count_incorrect_away, count_favorable_away)
for team, season_counts in home_away_counts.items():
    print(f"{team}")
    for season, (inc_count, fav_count) in season_counts.items():
        print(f"{season} Season: {inc_count} incorrect calls, {fav_count} favorable calls")
Home
2015 Season: 127 incorrect calls, 87 favorable calls
2016 Season: 388 incorrect calls, 264 favorable calls
2017 Season: 267 incorrect calls, 209 favorable calls
2018 Season: 313 incorrect calls, 228 favorable calls
2019 Season: 371 incorrect calls, 325 favorable calls
2020 Season: 259 incorrect calls, 293 favorable calls
2021 Season: 235 incorrect calls, 244 favorable calls
2022 Season: 349 incorrect calls, 322 favorable calls
2023 Season: 365 incorrect calls, 334 favorable calls
2024 Season: 227 incorrect calls, 254 favorable calls
Away
2015 Season: 125 incorrect calls, 95 favorable calls
2016 Season: 369 incorrect calls, 304 favorable calls
2017 Season: 320 incorrect calls, 185 favorable calls
2018 Season: 280 incorrect calls, 248 favorable calls
2019 Season: 351 incorrect calls, 343 favorable calls
2020 Season: 296 incorrect calls, 255 favorable calls
2021 Season: 248 incorrect calls, 228 favorable calls
2022 Season: 328 incorrect calls, 346 favorable calls
2023 Season: 338 incorrect calls, 365 favorable calls
2024 Season: 254 incorrect calls, 226 favorable calls
Plot the Data
Okay, now that we have counted up calls for every team across every season, we can use our data to plot incorrect and favorable call counts over time. We can also calculate the league-average number of incorrect and favorable calls for every season using the mean() function once again. Follow the code below and take a look at the results; you might notice something interesting!
Interestingly enough, there seem to be some significant team-level differences in call frequency. However, these trends shift quite a bit from season to season. It may be hard to see, but take a look at the Nuggets' number of favorable calls relative to the mean line. In 2023, the year they won the Finals, the Nuggets were actually far above average in terms of favorable calls. In contrast, the Warriors fell below the average number of favorable calls in 2022, when they won the championship over the Celtics. Thus, we cannot really generalize that some teams consistently receive better treatment from the refs than others, but we can keep exploring other factors! If you want to feel better about the team you support, see how they compare to the rest of the league by making some minor changes to the code below.
# Create a dataframe from the team_season_incorrect_counts dictionary and transpose
team_season_incorrect_df = pd.DataFrame(team_season_incorrect_counts)
team_season_incorrect_df = team_season_incorrect_df.T
# Create a dataframe from the team_season_favorable_counts dictionary and transpose
team_season_favorable_df = pd.DataFrame(team_season_favorable_counts)
team_season_favorable_df = team_season_favorable_df.T
# Calculate the average number of incorrect and favorable incorrect calls per season
avg_incorrect_calls = team_season_incorrect_df.mean(axis=0)
avg_favorable_calls = team_season_favorable_df.mean(axis=0)
# Plotting
fig, axs = plt.subplots(2, 1, figsize=(12, 14))
# Plot for number of incorrect calls against each team over seasons
for team in team_season_incorrect_df.index:
    axs[0].plot(team_season_incorrect_df.columns, team_season_incorrect_df.loc[team], label=team)
# Plot the average line for incorrect calls
axs[0].plot(team_season_incorrect_df.columns, avg_incorrect_calls, label="Average", color="black", linewidth=2, linestyle="--")
axs[0].set_title("Number of Incorrect Calls Against Each Team Over Seasons")
axs[0].set_xlabel("Season")
axs[0].set_ylabel("Number of Incorrect Calls")
axs[0].grid(True)
axs[0].legend(loc="center left", bbox_to_anchor=(1, 0.5))
# Plot for number of favorable incorrect calls for each team over seasons
for team in team_season_favorable_df.index:
    axs[1].plot(team_season_favorable_df.columns, team_season_favorable_df.loc[team], label=team)
# Plot the average line for favorable calls
axs[1].plot(team_season_favorable_df.columns, avg_favorable_calls, label="Average", color="black", linewidth=2, linestyle="--")
axs[1].set_title("Number of Favorable Incorrect Calls For Each Team Over Seasons")
axs[1].set_xlabel("Season")
axs[1].set_ylabel("Number of Favorable Incorrect Calls")
axs[1].grid(True)
axs[1].legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()
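For example, here is a quick sketch of how you might pull out a single team (the Knicks here, swap in your own) and line its favorable-call counts up against the league average from the plot above:
# Compare one team's favorable incorrect calls to the league average per season
my_team = "Knicks"  # change this to the team you support
comparison = pd.DataFrame({
    my_team: team_season_favorable_df.loc[my_team],
    "League average": avg_favorable_calls
})
print(comparison)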
Since we already counted calls for the home and away team columns, we can plot them as well. Overall, the away team drew more favorable calls in six of the ten seasons (2015-2016, 2018-2019, and 2022-2023).
# Get number of favorable and incorrect calls by season for home and away teams
seasons = sorted(list(home_away_counts["Home"].keys()))
home_incorrect_counts = [home_away_counts["Home"][season][0] for season in seasons]
away_incorrect_counts = [home_away_counts["Away"][season][0] for season in seasons]
home_favorable_counts = [home_away_counts["Home"][season][1] for season in seasons]
away_favorable_counts = [home_away_counts["Away"][season][1] for season in seasons]
# Plotting
fig, axs = plt.subplots(2, 1, figsize=(12, 10))
# Plot for number of incorrect calls over seasons for the home and away teams
axs[0].plot(seasons, home_incorrect_counts, label="Home")
axs[0].plot(seasons, away_incorrect_counts, label="Away")
axs[0].set_title("Number of Incorrect Calls Over Seasons for Home and Away Teams")
axs[0].set_xlabel("Season")
axs[0].set_ylabel("Number of Incorrect Calls")
axs[0].legend()
axs[0].grid(True)
# Plot for number of favorable calls over seasons for the home and away teams
axs[1].plot(seasons, home_favorable_counts, label="Home")
axs[1].plot(seasons, away_favorable_counts, label="Away")
axs[1].set_title("Number of Favorable Calls Over Seasons for Home and Away Teams")
axs[1].set_xlabel("Season")
axs[1].set_ylabel("Number of Favorable Calls")
axs[1].legend()
axs[1].grid(True)
plt.tight_layout()
plt.show()
Of course, not every referee is the same when it comes to accuracy; they aren't robots! To visualize this, we can count and plot the number of incorrect calls for every unique official listed in the dataset. This is not enough on its own, though, because some refs have not officiated as many games and are therefore less likely to rack up the same number of wrong calls as the more experienced referees. To account for this, we can also track how many standard deviations each referee's accuracy (proportion of correct calls to total calls) sits from the league average. This levels the playing field a bit and lets us compare the performance of each referee. You might notice that there is quite a bit of variability in accuracy across officials. Interesting!
Plot the Number of Incorrect Calls and Call Accuracy by Official
# Group by official and count the total number of calls made by each official
total_calls_by_official = foul_df_clean.groupby("official").size()
# Then get all of the incorrect calls
incorrect_calls_df = foul_df_clean[foul_df_clean["correct decision"] == 0]
# Group by official and count their incorrect calls
incorrect_calls_by_official = incorrect_calls_df.groupby("official").size()
# Calculate accuracy (proportion of correct calls)
accuracy_by_official = (total_calls_by_official - incorrect_calls_by_official) / total_calls_by_official
# Calculate mean call accuracy and standard deviation
mean_accuracy = accuracy_by_official.mean()
std_accuracy = accuracy_by_official.std()
# Calculate number of standard deviations from the mean (z-score) for each official
z_score = (accuracy_by_official - mean_accuracy) / std_accuracy
# Sort officials based on total incorrect calls
incorrect_calls_by_official = incorrect_calls_by_official.sort_values()
# Sort officials based on number of standard deviations from the mean accuracy
z_score = z_score.sort_values()
# Create a figure and two subplots
fig, axes = plt.subplots(2, 1, figsize=(10, 12))
# Plot total incorrect calls by official
axes[0].bar(incorrect_calls_by_official.index, incorrect_calls_by_official, color="lightblue")
axes[0].set_title("Total Incorrect Calls by Official")
axes[0].set_xlabel("Official")
axes[0].set_ylabel("Number of Incorrect Calls")
axes[0].tick_params(axis="x", rotation=90)
axes[0].tick_params(axis="x", labelsize=6) # Adjust label size
# Plot number of standard deviations from mean accuracy by official
axes[1].bar(z_score.index, z_score, color="orange")
axes[1].axhline(0, color="gray", linestyle="--", linewidth=0.5) # Add horizontal line at y=0
axes[1].set_title("Number of Standard Deviations from Mean Accuracy by Official")
axes[1].set_xlabel("Official")
axes[1].set_ylabel("Standard Deviations from Mean Accuracy")
axes[1].tick_params(axis="x", rotation=90)
axes[1].tick_params(axis="x", labelsize=6) # Adjust label size
plt.tight_layout()
plt.show()
The last two plots in this section are much easier to complete. To plot the frequency of incorrect calls as the number of seconds remaining decreases, we can just remove observations where the number of seconds is above 120, or 2 minutes. To plot the frequency of incorrect calls depending on the type of foul called, we can use the groupby() and mean() functions.
Based on the charts, it seems like referees tend to get more calls wrong as game time decreases. This can be especially concerning for close games! Also, the correctness of calls appears to shift significantly across foul types, perhaps due to some calls being more common than others.
Plot the Number of Incorrect Calls Over Remaining Game Time
# Keep only the fouls called in the last two minutes (120 seconds or less remaining)
l2m = foul_df_clean[foul_df_clean["seconds"] <= 120]
# Select the incorrect calls made in the last two minutes
incorrect_calls_l2m = l2m[l2m["correct decision"] == 0]
# Plotting
plt.figure(figsize=(8, 6))
incorrect_calls_l2m["seconds"].hist(bins=24, color="skyblue", edgecolor="black")
plt.xlabel("Seconds Remaining")
plt.ylabel("Number of Incorrect Calls")
plt.title("Number of Incorrect Calls in the Last Two Minutes")
plt.grid(axis="y")
plt.show()
Plot the Accuracy of Calls for Each Type of Foul Call
# Calculate accuracy for each type of call
accuracy_by_type = foul_df_clean.groupby("type")["correct decision"].mean().sort_values()
# Plot the accuracy for each type of call
plt.figure(figsize=(10, 6))
sns.barplot(x=accuracy_by_type.index,y=accuracy_by_type.values)
plt.title("Accuracy of Calls by Type")
plt.xlabel("Type of Call")
plt.ylabel("Accuracy")
plt.tick_params(axis="x", rotation=90, labelsize=6)
plt.show()
With all of the exploration complete, I think we are ready to start coming up with some possible hypotheses.
Model Analysis, Hypothesis Testing, Machine Learning
At this stage of the data science pipeline, we will use a few well-known modeling and machine learning techniques to see how valid our assumptions are. To be more specific, we will train logistic regression and random forest models on our cleaned data and see how useful certain variables like official and remaining game time are for predicting incorrect foul calls. We will explain the different models later on, but first we have to prepare our dataset for training.
We will use the same independent (predictors) and dependent (response) variables for both models. Independent: season, seconds remaining, foul type, official. Dependent: correct decision.
If you try to split the dataset and train a model on it as-is, you will likely run into errors. This is because official names and foul types are represented as strings, which you cannot perform mathematical operations on! Fixing this is pretty simple with one-hot encoding, a method that converts categorical variables into unique sequences of 1s and 0s. For example, we can map the values [Cat, Dog, Fish] to [[1,0,0], [0,1,0], [0,0,1]]. If you want to learn more about one-hot encoding, you can read about it here. In Python, all we have to do is run the Pandas get_dummies() function to one-hot encode the season, type, and official columns. Even though seasons are technically numeric, we will treat them as categorical values.
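To see what get_dummies() does before applying it to the real columns, here is a quick sketch using the toy [Cat, Dog, Fish] values from above:
# One-hot encode a toy column of categorical values
toy = pd.DataFrame({"animal": ["Cat", "Dog", "Fish"]})
encoded = pd.get_dummies(toy, columns=["animal"])
print(encoded)
# Each animal becomes its own indicator column (animal_Cat, animal_Dog, animal_Fish),
# with a single "on" value per row (shown as True/False or 1/0 depending on your pandas version)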
While we are at it, we can standardize the seconds column by calculating its mean and standard deviation and applying the formula: $$Z = \frac{x - \textrm{mean}}{\textrm{standard deviation}}$$ Standardization rescales the column so it has a mean of 0 and a standard deviation of 1. It does not change the shape of the distribution, but it puts the seconds feature (which can range from 0 to 120) on a scale comparable to the 0/1 encoded columns, so its raw magnitude does not dominate the model.
Preprocess Data
# Create a new dataset which includes the features we want to test on
# Our independent variables (predictors) are season, seconds remaining, type of foul called, and the official
# Our dependent variable (response) is the correctness of the decision
foul_df_final = foul_df_clean[["season", "seconds", "type", "official", "correct decision"]].copy()
# Standardize the seconds column
mean_seconds = np.mean(foul_df_final["seconds"])
std_seconds = np.std(foul_df_final["seconds"])
foul_df_final["seconds"] = (foul_df_final["seconds"] - mean_seconds) / std_seconds
# One hot encode seasons and official names
foul_df_final = pd.get_dummies(foul_df_final, columns=["season", "type", "official"], prefix=["season", "type", "official"])
foul_df_final.head()
seconds | correct decision | season_2015 | season_2016 | season_2017 | season_2018 | season_2019 | season_2020 | season_2021 | season_2022 | ... | official_Suyash Mehta | official_Tom Washington | official_Tony Brothers | official_Tre Maddox | official_Tyler Ford | official_Tyler Mirkovich | official_Tyler Ricks | official_Violet Palmer | official_Vladimir Voyard-Tadal | official_Zach Zarba | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.452359 | 1 | True | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
1 | 1.230863 | 1 | True | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
2 | 0.960147 | 1 | True | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
3 | 0.394103 | 1 | True | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
4 | 0.000334 | 1 | True | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 179 columns
Checking for and Handling Target Class Imbalance
# Count the occurrences of each value in the correct decision column
correct_decision_counts = foul_df_final["correct decision"].value_counts()
print(correct_decision_counts)
correct decision
1    74543
0     5837
Name: count, dtype: int64
Oh no, it looks like our dataset is imbalanced! Class imbalance occurs when the minority class, which is "correct decision == 0" in this case, appears far less often than the majority class. There are over 12 times as many correct decisions as there are incorrect decisions. There are a few ways to deal with this, but the easiest solution is to pass a certain parameter when running our machine learning models (more on this soon). Before that, we can split our data into training and test sets with the train_test_split function.
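As a quick aside, the parameter we have in mind is scikit-learn's class_weight="balanced" option, which we will pass to both models below. Roughly speaking, it weights each class by n_samples / (n_classes * class_count), so the rare incorrect calls count much more heavily during training. A short sketch using the counts we just printed:
# Sketch of the weights that class_weight="balanced" assigns to each class
from sklearn.utils.class_weight import compute_class_weight

counts = foul_df_final["correct decision"].value_counts()
manual_weights = counts.sum() / (len(counts) * counts)
print(manual_weights)  # class 0 (incorrect) gets a weight roughly 13x larger than class 1

# The same numbers via scikit-learn's helper function
print(compute_class_weight(class_weight="balanced",
                           classes=np.array([0, 1]),
                           y=foul_df_final["correct decision"]))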
Split Data into Train and Test Splits
# Prepare the data for a train test split
# Our predictors are season, seconds remaining, foul type, and official
# Our response is a correct decision (1) or incorrect decision (0)
X = foul_df_final.drop(columns=["correct decision"]) # All columns except the target column
y = foul_df_final["correct decision"] # Target column
# Test size = the proportion of the dataset to include in the test split
# Random state = controls the shuffling of the data before the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Logistic regression is used for predicting binary outcomes like incorrect and correct foul calls. It passes a weighted combination of our independent variables through the sigmoid function to produce a probability between 0 and 1, and that probability is turned into a prediction using a threshold, typically 0.5. All we have to do is provide the LogisticRegression model with a maximum number of iterations and the parameter class_weight="balanced", which tells scikit-learn to weight each class inversely to its frequency so the rare incorrect calls are not ignored. In the background, the model will use our training data to find the coefficients for our independent variables that best fit the training outputs. Once the model is trained, we can assess its performance by making predictions on the test set with the predict() function. We can also see which variables are most influential by printing the five largest coefficients (by absolute value). From this we can gather that the SUPPORT RULING, OVERTURN RULING, and PERSONAL TAKE call types have the strongest influence on the model's predicted probabilities.
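Before fitting the real model, here is a tiny self-contained sketch of the sigmoid step described above, using made-up weights and feature values purely for illustration:
# The sigmoid squashes any real number into a probability between 0 and 1
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Hypothetical weighted sum of two features plus an intercept (not from our fitted model)
z = 0.8 * 1.0 + (-1.2) * 0.5 + 0.3
probability = sigmoid(z)              # ~0.62
prediction = int(probability >= 0.5)  # 1, since the probability clears the 0.5 threshold
print(probability, prediction)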
Logistic Regression: Predicting/Classifying Incorrect Calls
# Initialize and train the model
model = LogisticRegression(max_iter=2000, class_weight="balanced")
model.fit(X_train, y_train)
# Predictions
y_pred = model.predict(X_test)
# Get the coefficients for the predictors
coefficients = model.coef_[0]
top_indices = np.argsort(np.abs(coefficients))[::-1][:5]
for index in top_indices:
    feature = X_train.columns[index]
    coefficient = coefficients[index]
    print(f"Feature: {feature}, Coefficient: {coefficient}")
Feature: type_SUPPORT RULING, Coefficient: 4.351317369954331
Feature: type_OVERTURN RULING, Coefficient: 3.5367877183853538
Feature: type_PERSONAL TAKE, Coefficient: 3.358119584102903
Feature: type_OUT OF BOUNDS - BAD PASS TURN, Coefficient: 2.58783960803185
Feature: official_Sha'Rae Mitchell, Coefficient: 2.2084677967373034
To evaluate the model's performance, we can use the following metrics:
- Accuracy = the overall percentage of correct predictions.
- Precision = the percentage of positive predictions that are actually correct.
- Recall = the percentage of actual positives that are correctly identified.
- F1 = the harmonic mean of precision and recall.
For visualization, we can plot a confusion matrix, which is a table showing the number of true/false positives and true/false negatives. A true positive is a correct prediction of the positive class, a false positive is an incorrect prediction of the positive class, a true negative is a correct prediction of the negative class, and a false negative is an incorrect prediction of the negative class. Our logistic regression produces a massive number of true positives compared to everything else, mostly because there are far more positive examples than negative examples in our dataset. To further reduce the effect of class imbalance, we could undersample the majority class (remove some of the positive examples) or oversample the minority class, as sketched below.
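Here is a rough sketch of the random-oversampling idea (we do not actually apply it in this tutorial): duplicate minority-class rows in the training set until the two classes are roughly the same size, then retrain on the balanced set.
# Random oversampling sketch: duplicate incorrect-call rows in the training data
train = X_train.copy()
train["correct decision"] = y_train
majority = train[train["correct decision"] == 1]
minority = train[train["correct decision"] == 0]
# Sample the minority class with replacement until it matches the majority class size
minority_upsampled = minority.sample(n=len(majority), replace=True, random_state=42)
balanced_train = pd.concat([majority, minority_upsampled]).sample(frac=1, random_state=42)
X_train_bal = balanced_train.drop(columns=["correct decision"])
y_train_bal = balanced_train["correct decision"]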
Finally, we can plot an ROC curve, which shows the tradeoff between the model's True Positive Rate and False Positive Rate (TPR and FPR). Without going into too much detail, a curve that hugs the top-left corner is good, while a curve that sits on the diagonal line means our model's predictions are no better than random guesses. The area under the curve, or AUC, summarizes the performance of our model in a single value between 0 and 1: an AUC of 1 indicates a perfect model, while an AUC of 0.5 is equivalent to guessing. Thankfully, we achieved an AUC of 0.69. Not perfect by any means, but not meaningless either! For more information on these metrics, go here.
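For reference, the two rates on the ROC curve's axes can be computed by hand from the confusion-matrix counts at a single threshold (just a sanity-check sketch; the roc_curve function used later sweeps over many thresholds):
# True positive rate and false positive rate at the default 0.5 threshold
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tpr = tp / (tp + fn)  # same as recall: share of actual positives we caught
fpr = fp / (fp + tn)  # share of actual negatives we mistakenly flagged as positive
print(tpr, fpr)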
Testing our Model
# Evaluate the model using a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Plot confusion matrix as a heatmap
plt.figure(figsize=(4, 2))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="g", cbar=False)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)
# Calculate recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)
# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)
Accuracy: 0.7515551132122419
Precision: 0.9500537501033656
Recall: 0.7721621076685261
F1 Score: 0.8519205101586831
Plotting the ROC Curve
# Get predicted probabilities for the positive class
y_probs = model.predict_proba(X_test)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label=1)
# Calculate AUC (Area Under the Curve)
auc = roc_auc_score(y_test, y_probs)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color="blue", label=f"ROC Curve (AUC = {auc:.2f})")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=1)
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
Let's move on and compare our logistic regression to a random forest model. The random forest classifier is an ensemble method that combines the predictions of multiple decision trees to produce a more accurate prediction than any single decision tree on its own; usually, the final prediction is made by taking the majority vote of the trees. Each decision tree splits the input data into branches based on feature/predictor values and produces predictions at the leaf level. The features considered at any given split are chosen through random feature selection, a process that randomly picks a subset of predictors (like official and seconds, or season and call type). You can learn about the model in more detail here.
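To make the voting idea concrete, here is a tiny, purely illustrative sketch with hypothetical votes (not taken from our trained forest, which scikit-learn aggregates internally):
# Majority vote over hypothetical predictions from five decision trees
from collections import Counter

tree_predictions = [1, 0, 1, 1, 0]  # made-up votes: three trees say "correct call", two say "incorrect"
majority_class = Counter(tree_predictions).most_common(1)[0][0]
print(majority_class)  # 1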
After calculating the same metrics, we can see a few things:
- The F1 score is higher than the logistic regression's. This means the random forest model is better at predicting correct foul calls.
- The AUC is slightly lower than the logistic regression's. This means the random forest model is slightly worse at distinguishing between incorrect and correct foul calls.
- The most important feature is not a specific call type, it is the number of seconds left in a game.
Comparing the Results to a Random Forest Classifier
# Initialize and train the Random Forest model
model_rf = RandomForestClassifier(random_state=42, class_weight="balanced")
model_rf.fit(X_train, y_train)
# Predictions
y_pred_rf = model_rf.predict(X_test)
# Evaluate the model using a confusion matrix
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
# Plot confusion matrix as a heatmap
plt.figure(figsize=(4, 2))
sns.heatmap(conf_matrix_rf, annot=True, cmap="Blues", fmt="g", cbar=False)
plt.title("Confusion Matrix (Random Forest)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
# Calculate accuracy, precision, recall, and F1-score
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
print("Accuracy (Random Forest):", accuracy_rf)
print("Precision (Random Forest):", precision_rf)
print("Recall (Random Forest):", recall_rf)
print("F1 Score (Random Forest):", f1_rf)
# Get predicted probabilities for the positive class
y_probs_rf = model_rf.predict_proba(X_test)[:, 1]
# Calculate ROC curve and AUC
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_probs_rf, pos_label=1)
auc_rf = roc_auc_score(y_test, y_probs_rf)
# Plot ROC curve
plt.figure(figsize=(6, 4))
plt.plot(fpr_rf, tpr_rf, color="blue", label=f"ROC Curve (AUC = {auc_rf:.2f})")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=1)
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
# Get the feature importances
feature_importances_rf = model_rf.feature_importances_
# Sort the features based on their importance
sorted_indices_rf = np.argsort(feature_importances_rf)[::-1]
top_features_rf = X_train.columns[sorted_indices_rf][:5]
top_importances_rf = feature_importances_rf[sorted_indices_rf][:5]
# Plot the top 5 features
plt.figure(figsize=(6, 4))
plt.barh(top_features_rf, top_importances_rf, color="skyblue")
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Top 5 Feature Importances")
plt.gca().invert_yaxis()
plt.show()
Accuracy (Random Forest): 0.8903956208011943
Precision (Random Forest): 0.9306586118589533
Recall (Random Forest): 0.9525505746353922
F1 Score (Random Forest): 0.9414773482130995
Insights

This part of the data lifecycle is where we try to draw conclusions from our analysis and modeling. So, what have we learned here? Well, we can say a few things:
- Since 2015, referees have gotten better at making foul calls at the end of games.
- Some teams get more favorable calls than others, but this is not consistent across seasons.
- Accuracy among officials can vary drastically.
- Certain foul types get incorrectly called a lot more than others, like double technical fouls. This makes sense as technical fouls are quite subjective and can be swayed by the emotions of a referee.
- Foul calls get increasingly inaccurate as time expires, and time appears to be the most important feature when it comes to predicting the correctness of a foul call.
Overall, it is safe to say that referees are not as bad as fans make them out to be. Although they get calls wrong, there are many factors outside of their control that make mistakes hard to avoid. With that in mind, referees may still have biases that cannot be found in our dataset. For example, officials could treat star players better than benchwarmers, but there is no way of knowing that without a player dataset on hand as well. The answers to our questions are still not fully settled, but we hope you can take what you learned from this project and apply it to other datasets!