Given that we’ve been doing some work with the NBA and “blowouts” in a couple of different posts—one here and another here—I thought we might look at the WNBA too. As a first step, I found a Kaggle dataset that covers games up until 2020, if I remember correctly. We then wrote some code to extend it with the most recent games:

import pandas as pd
import numpy as np
from nba_api.stats.endpoints import leaguegamelog
from datetime import datetime
import time

# --- Configuration ---
ORIGINAL_CSV = "1997-2020_officialBoxScore.csv"
UPDATED_CSV = "1997-2024_officialBoxScore.csv"
# NOTE(review): ORIGINAL_CSV's name suggests it already contains 2020; if it
# does, starting the fetch at 2020 appends duplicate rows -- confirm.
START_YEAR = 2020
END_YEAR = 2024


def calculate_possessions(fga, to, fta):
    """Return a simplified possession estimate: FGA + TO + 0.4 * FTA."""
    return fga + to + 0.4 * fta  # Simplified approximation


def get_turnovers(row):
    """Return the turnover count from a game-log row, or 0 if none is present.

    The nba_api LeagueGameLog result set is documented to name this column
    'TOV'; 'TO' and 'TURNOVERS' are still accepted so rows from other sources
    keep working. Looking up only 'TO' with a default of 0 silently zeroes
    turnovers whenever the column is actually called 'TOV'.
    """
    for column in ('TOV', 'TO', 'TURNOVERS'):
        value = row.get(column)
        if value is not None:
            return value
    return 0


def compute_side_stats(own, opp, min_played=240):
    """Compute one side's basic and advanced box-score stats.

    Args:
        own: Mapping-like game-log row (dict or pandas Series) for the side
            being described.
        opp: Mapping-like game-log row for the other side of the same game.
        min_played: Team minutes used for the per-minute stats. The default
            of 240 is 48 minutes * 5 players.
            NOTE(review): WNBA regulation games are 40 minutes, so 240 looks
            like an NBA value -- confirm against the existing dataset.

    Returns:
        Dict keyed by column suffix ('PTS', 'FG%', 'Ortg', ...) in the order
        the columns appear in a game record. Every ratio falls back to 0 when
        its denominator is not positive.
    """
    # Raw counts for this side.
    fga = own['FGA']
    fgm = own['FGM']
    three_pa = own.get('FG3A', 0)
    three_pm = own.get('FG3M', 0)
    two_pa = fga - three_pa
    # Made two-pointers are made field goals minus made threes.
    two_pm = fgm - three_pm
    fta = own['FTA']
    ftm = own['FTM']
    orb = own.get('OREB', 0)
    drb = own.get('DREB', 0)
    trb = orb + drb
    to = get_turnovers(own)
    ast = own['AST']
    stl = own['STL']
    blk = own['BLK']
    pf = own['PF']
    pts = own['PTS']

    # The few opponent counts that this side's rates depend on.
    opp_two_pa = opp['FGA'] - opp.get('FG3A', 0)
    opp_orb = opp.get('OREB', 0)
    opp_drb = opp.get('DREB', 0)
    opp_trb = opp_orb + opp_drb
    opp_pts = opp['PTS']

    poss = calculate_possessions(fga, to, fta)
    true_shots = fga + (fta * 0.44)
    to_chances = fga + 0.44 * fta + to
    play_chances = fga - orb + to
    ar_chances = fga - 0.44 * fta + ast + to

    fic = (pts + orb + (0.75 * drb) + ast + stl + blk
           - (0.75 * fga) - (0.375 * fta) - to - (0.5 * pf))
    ortg = (pts * 100) / poss if poss > 0 else 0
    # Defensive rating is computed on this side's own possession estimate.
    drtg = (opp_pts * 100) / poss if poss > 0 else 0

    return {
        'PTS': pts,
        'AST': ast,
        'TO': to,
        'Min': min_played,
        'STL': stl,
        'BLK': blk,
        'PF': pf,
        'FGA': fga,
        'FGM': fgm,
        'FG%': (fgm / fga * 100) if fga > 0 else 0,
        '2PA': two_pa,
        '2PM': two_pm,
        '2P%': (two_pm / two_pa * 100) if two_pa > 0 else 0,
        '3PA': three_pa,
        '3PM': three_pm,
        '3P%': (three_pm / three_pa * 100) if three_pa > 0 else 0,
        'FTA': fta,
        'FTM': ftm,
        'FT%': (ftm / fta * 100) if fta > 0 else 0,
        'ORB': orb,
        'DRB': drb,
        'TRB': trb,
        'TREB%': (trb * 100) / (trb + opp_trb) if (trb + opp_trb) > 0 else 0,
        'ASST%': (ast / fgm) * 100 if fgm > 0 else 0,
        'TS%': pts / (2 * true_shots) * 100 if true_shots > 0 else 0,
        'EFG%': (fgm + (three_pm / 2)) / fga * 100 if fga > 0 else 0,
        'OREB%': (orb * 100) / (orb + opp_drb) if (orb + opp_drb) > 0 else 0,
        'DREB%': (drb * 100) / (drb + opp_orb) if (drb + opp_orb) > 0 else 0,
        'TO%': (to * 100) / to_chances if to_chances > 0 else 0,
        'STL%': (stl * 100) / poss if poss > 0 else 0,
        'BLK%': (blk * 100) / poss if poss > 0 else 0,
        'BLKR': (blk * 100) / opp_two_pa if opp_two_pa > 0 else 0,
        'PPS': pts / fga if fga > 0 else 0,
        'FIC': fic,
        'FIC40': (fic * 40 * 5) / min_played if min_played > 0 else 0,
        'Ortg': ortg,
        'Drtg': drtg,
        'EDiff': ortg - drtg,
        'Play%': fgm / play_chances if play_chances > 0 else 0,
        'AR': (ast * 100) / ar_chances if ar_chances > 0 else 0,
        'Poss': poss,
        'AST/TO': ast / to if to > 0 else 0,
        'Pace': (poss * 48 * 5) / min_played if min_played > 0 else 0,
        'STL/TO': stl / to if to > 0 else 0,
    }


def build_team_record(team_data, oppt_data, loc, rslt, game_date, season_str, winner):
    """Build one dataset row describing a game from `team_data`'s point of view.

    The 'team*' columns describe `team_data` and the 'oppt*' columns describe
    `oppt_data`. Both halves come from the same `compute_side_stats` call with
    the roles swapped, so they cannot drift apart (the previous hand-copied
    version wrote the team's FT% into 'opptFT%').

    Args:
        team_data: Mapping-like game-log row for the team the row is about.
        oppt_data: Mapping-like game-log row for its opponent.
        loc: 'Home' or 'Away' for `team_data`.
        rslt: 'Win' or 'Loss' for `team_data`.
        game_date: Game date string stored in 'gmDate'.
        season_str: Season label stored in 'season'.
        winner: Abbreviation of the winning team, stored in 'matchWinner'.

    Returns:
        Dict with the game columns, then the team columns, then the opponent
        columns, then 'matchWinner'.
    """
    record = {
        'gmDate': game_date,
        'seasonType': 'Regular',
        'season': season_str,
    }
    sides = (
        ('team', team_data, oppt_data, loc, rslt),
        ('oppt', oppt_data, team_data,
         'Away' if loc == 'Home' else 'Home',
         'Loss' if rslt == 'Win' else 'Win'),
    )
    for prefix, own, opp, side_loc, side_rslt in sides:
        # Win/loss totals and days off are not derivable from a single game
        # log row, so they are written as 0 placeholders.
        record[f'{prefix}Wins'] = 0
        record[f'{prefix}Losses'] = 0
        record[f'{prefix}Abbr'] = own['TEAM_ABBREVIATION']
        record[f'{prefix}Loc'] = side_loc
        record[f'{prefix}Rslt'] = side_rslt
        record[f'{prefix}DayOff'] = 0
        for stat, value in compute_side_stats(own, opp).items():
            record[f'{prefix}{stat}'] = value
    record['matchWinner'] = winner
    return record


def build_game_records(team1_data, team2_data, season_str):
    """Return the two dataset rows (home first, then away) for one game.

    Args:
        team1_data: First game-log row of the game.
        team2_data: Second game-log row of the game.
        season_str: Season label stored in each row.

    Returns:
        List of two dicts produced by `build_team_record`.
    """
    # Determine home and away teams (approximate based on MATCHUP): an '@' in
    # the first row's MATCHUP marks that row as the visiting team.
    if 'MATCHUP' in team1_data and '@' in team1_data['MATCHUP']:
        away_team, home_team = team1_data, team2_data
    else:
        home_team, away_team = team1_data, team2_data

    game_date = pd.to_datetime(team1_data['GAME_DATE']).strftime('%Y-%m-%d')

    # Anything other than a strict home lead is recorded as an away win.
    if home_team['PTS'] > away_team['PTS']:
        home_result, away_result = 'Win', 'Loss'
        winner = home_team['TEAM_ABBREVIATION']
    else:
        home_result, away_result = 'Loss', 'Win'
        winner = away_team['TEAM_ABBREVIATION']

    return [
        build_team_record(home_team, away_team, 'Home', home_result,
                          game_date, season_str, winner),
        build_team_record(away_team, home_team, 'Away', away_result,
                          game_date, season_str, winner),
    ]


def update_dataset():
    """Fetch START_YEAR..END_YEAR game logs and append them to the dataset.

    Reads ORIGINAL_CSV, downloads one regular-season league game log per year
    through nba_api, converts every two-row game into two dataset rows, and
    writes the combined table to UPDATED_CSV. A season whose fetch or
    processing raises is reported and skipped.
    """
    # Load the existing dataset
    print(f"Loading existing dataset: {ORIGINAL_CSV}")
    df_existing = pd.read_csv(ORIGINAL_CSV)

    # List to store new game data
    new_games = []

    print(f"Fetching WNBA data from {START_YEAR} to {END_YEAR} using nba_api...")
    for year in range(START_YEAR, END_YEAR + 1):
        # NOTE(review): this builds NBA-style labels such as "2020-21"; WNBA
        # seasons fall within one calendar year, so confirm that the endpoint
        # and the existing 'season' column both expect this format.
        season_str = f"{year}-{str(year+1)[2:]}"
        print(f"Fetching data for season: {season_str}")

        try:
            # Fetch league game log for the season
            game_logs = leaguegamelog.LeagueGameLog(
                league_id='10',  # WNBA League ID
                season=season_str,
                season_type_all_star='Regular Season'
            ).get_data_frames()[0]

            if game_logs.empty:
                print(f"No game log data found for {season_str}.")
                continue

            # Debug: Print available columns
            print(f"Available columns for {season_str}: {game_logs.columns.tolist()}")

            # Group by game_id to process each game
            games = game_logs.groupby('GAME_ID')
            for game_id, game_data in games:
                # Each game should have two teams
                if len(game_data) != 2:
                    print(f"Skipping game {game_id}: Expected 2 teams, found {len(game_data)}")
                    continue

                new_games.extend(
                    build_game_records(game_data.iloc[0], game_data.iloc[1], season_str)
                )

            print(f"Processed {len(games)} games for season {season_str}.")
            time.sleep(1)  # Respect API rate limits

        except Exception as e:
            # Top-level boundary: report the failure and move on to the next season.
            print(f"Error fetching data for season {season_str}: {e}")
            continue

    # Create a DataFrame for new data
    if new_games:
        df_new = pd.DataFrame(new_games)
        # Ensure columns match the existing dataset
        df_new = df_new[df_existing.columns]
        # Combine datasets
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        # Save the updated dataset
        df_combined.to_csv(UPDATED_CSV, index=False)
        print(f"Updated dataset saved as {UPDATED_CSV} with {len(df_combined)} rows.")
    else:
        print("No new data was collected.")


if __name__ == "__main__":
    update_dataset()

Now, with the updated dataset, we can write some code to plot the blowouts and their proportions over time:

blowout_counts

blowout_proportions

The script that produces these plots is here:

import pandas as pd
import matplotlib.pyplot as plt

# Path of the box-score CSV to analyse (replace with your actual file path).
# NOTE(review): the update script shown earlier in this post writes
# "1997-2024_officialBoxScore.csv" -- confirm which file name is intended.
DATA_CSV = '1997-2025_officialBoxScore.csv'

# A game is a blowout when the final margin is strictly greater than this.
BLOWOUT_MARGIN = 20


def one_row_per_game(df):
    """Return `df` reduced to a single row per game.

    The box-score table stores each game once per team, so counting rows
    counts every game at least twice. Rows are treated as the same game when
    they share 'gmDate' and the same unordered pair of 'teamAbbr'/'opptAbbr'
    (assumes two teams do not meet twice on one date -- TODO confirm).

    If any of those columns is missing the games cannot be identified; a
    warning is printed and `df` is returned unchanged.
    """
    needed = ['gmDate', 'teamAbbr', 'opptAbbr']
    missing = [column for column in needed if column not in df.columns]
    if missing:
        print(f"Warning: cannot de-duplicate games, missing columns: {missing}")
        return df

    team = df['teamAbbr'].astype(str)
    oppt = df['opptAbbr'].astype(str)
    team_sorts_first = team <= oppt
    # Order the two abbreviations so both perspectives of a game share a key.
    game_key = pd.DataFrame({
        'gmDate': df['gmDate'],
        'first': team.where(team_sorts_first, oppt),
        'second': oppt.where(team_sorts_first, team),
    })
    return df[~game_key.duplicated().to_numpy()]


def summarize_blowouts(df, margin=BLOWOUT_MARGIN):
    """Count blowouts and total games per season.

    Args:
        df: Box-score table with 'season', 'gmDate', 'teamPTS' and 'opptPTS'
            columns (plus 'teamAbbr'/'opptAbbr' for de-duplication).
        margin: A game is a blowout when |teamPTS - opptPTS| > margin.

    Returns:
        DataFrame indexed by season with columns 'blowout_games',
        'total_games' and 'blowout_proportion'.
    """
    games = one_row_per_game(df).copy()

    # Calculate margin of victory and flag blowouts
    games['margin'] = (games['teamPTS'] - games['opptPTS']).abs()
    games['is_blowout'] = games['margin'] > margin

    # Group by season to get blowout counts and total games
    summary = games.groupby('season').agg({
        'is_blowout': 'sum',  # Count of blowout games
        'gmDate': 'count'     # Total games
    }).rename(columns={'gmDate': 'total_games', 'is_blowout': 'blowout_games'})

    # Calculate proportion of blowout games
    summary['blowout_proportion'] = summary['blowout_games'] / summary['total_games']
    return summary


def save_season_plot(seasons, values, title, ylabel, filename, **line_kwargs):
    """Draw one per-season line chart and save it to `filename`."""
    plt.figure(figsize=(10, 6))
    plt.plot(seasons, values, marker='o', **line_kwargs)
    plt.title(title)
    plt.xlabel('Season')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.xticks(seasons, rotation=45)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()


def plot_blowouts():
    """Read DATA_CSV and write the blowout count and proportion charts."""
    df = pd.read_csv(DATA_CSV)
    blowout_counts = summarize_blowouts(df, BLOWOUT_MARGIN)

    # Plot 1: Number of blowout games per season
    save_season_plot(
        blowout_counts.index,
        blowout_counts['blowout_games'],
        f'Number of WNBA Blowout Games (Margin > {BLOWOUT_MARGIN}) per Season',
        'Number of Blowout Games',
        'wnba_blowout_counts.png',
    )

    # Plot 2: Proportion of blowout games per season
    save_season_plot(
        blowout_counts.index,
        blowout_counts['blowout_proportion'],
        f'Proportion of WNBA Blowout Games (Margin > {BLOWOUT_MARGIN}) per Season',
        'Proportion of Blowout Games',
        'wnba_blowout_proportions.png',
        color='orange',
    )


if __name__ == "__main__":
    plot_blowouts()

Perhaps we could overlay the graphs with the NBA—maybe a project for another time.