{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import math\n", "import sys\n", "pd.set_option(\"display.max_columns\",50)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# download data from here: https://s3.amazonaws.com/pbpstats/db_dumps/raw_shots.csv.zip and put it in working directory\n", "all_shots = pd.read_csv('raw_shots.csv', dtype={'GameId': str})" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "all_shots = all_shots[~(all_shots.X.isna())]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# fill in missing putback column with False\n", "all_shots.Putback.fillna(False, inplace=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def is_regular_season(row):\n", " if row['GameId'][2] == '2':\n", " return True\n", " return False\n", "\n", "all_shots['IsRegularSeason'] = all_shots.apply(is_regular_season, axis=1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_season(row):\n", " # 2017-18 season will be 17, 2016-17 will be 16...\n", " # 2017-18 season for gleague will be 217\n", " # 2018 season for wnba will be 118\n", " return int(row['GameId'][0] + row['GameId'][3:5])\n", "\n", "all_shots['Season'] = all_shots.apply(get_season, axis=1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# add play start type - start type for possessions with no oreb, missed shot type for shots after orebs\n", "def get_play_start_type(row):\n", " if row['OrebShotType'] is np.nan:\n", " return row['StartType']\n", " elif row['OrebShotType'] in ['Arc3Blocked', 'Corner3Blocked']:\n", " # combine off blocked 3 orebs due to sample size issues\n", " return 'Off3BlockedOreb'\n", " elif row['OrebShotType'] in ['Corner3', 'Arc3']:\n", " # combine off 3 orebs because model does better when they are combined\n", " return 'Off3Oreb'\n", " else:\n", " return 'Off' + row['OrebShotType'] + 'Oreb'\n", "\n", "all_shots['PlayStartType'] = all_shots.apply(get_play_start_type, axis=1) " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_time_since_play_started(row):\n", " if row['OrebShotType'] is np.nan:\n", " return row['StartTime'] - row['Time']\n", " else:\n", " # use time since oreb if off oreb\n", " return row['SecondsSinceOReb']\n", "\n", "all_shots['SecondsSincePlayStarted'] = all_shots.apply(get_time_since_play_started, axis=1) " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def get_shot_distance(row):\n", " x_squared = row['X'] ** 2\n", " y_squared = row['Y'] ** 2\n", " shot_distance = math.sqrt(x_squared + y_squared) / 10 # unit for distance is off by factor of 10, divide by 10 to convert to feet\n", " return round(shot_distance, 1)\n", "\n", "all_shots['ShotDistance'] = all_shots.apply(get_shot_distance, axis=1) " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def get_shot_angle_from_centre(row):\n", " # 0 is straightaway, 90 is from the corners\n", " angle = abs(math.degrees(math.atan2(row['X'], row['Y'])))\n", " if angle < 90:\n", " return round(angle, 1)\n", " else:\n", " return round(180 - angle, 1)\n", "\n", "all_shots['ShotAngle'] = all_shots.apply(get_shot_angle_from_centre, axis=1) " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def make_all_ot_periods_5(row):\n", " # small sample size with Period > 5, make all OT periods 5\n", " if row['Period'] > 5:\n", " return 5\n", " else:\n", " return row['Period']\n", "\n", "all_shots['Period'] = all_shots.apply(make_all_ot_periods_5, axis=1)\n", "\n", "base_features = ['Season', 'Made', 'Period', 'StartScoreDifferential', 'ShotValue', 'Time', 'Putback', 'IsRegularSeason', 'PlayStartType', 'SecondsSincePlayStarted', 'ShotDistance', 'ShotAngle']\n", "all_shots = pd.get_dummies(all_shots[base_features], prefix='is')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# group together start types to see what level of detail on start type does better\n", "\n", "def combine_off_block_orebs(row):\n", " if row['is_Off3BlockedOreb'] + row['is_OffAtRimBlockedOreb'] + row['is_OffLongMidRangeBlockedOreb'] + row['is_OffShortMidRangeBlockedOreb'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "def combine_off_blocks(row):\n", " if row['is_OffArc3Block'] + row['is_OffCorner3Block'] + row['is_OffAtRimBlock'] + row['is_OffLongMidRangeBlock'] + row['is_OffShortMidRangeBlock'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "def combine_off_makes(row):\n", " if row['is_OffArc3Make'] + row['is_OffCorner3Make'] + row['is_OffAtRimMake'] + row['is_OffLongMidRangeMake'] + row['is_OffShortMidRangeMake'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "\n", "def combine_off_orebs(row):\n", " if row['is_Off3Oreb'] + row['is_OffAtRimOreb'] + row['is_OffLongMidRangeOreb'] + row['is_OffShortMidRangeOreb'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "def combine_off_missed_fg(row):\n", " if row['is_OffArc3Miss'] + row['is_OffCorner3Miss'] + row['is_OffAtRimMiss'] + row['is_OffLongMidRangeMiss'] + row['is_OffShortMidRangeMiss'] == 1:\n", " return 1\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "all_shots['is_OffBlockedOreb'] = all_shots.apply(combine_off_block_orebs, axis=1)\n", "all_shots['is_OffBlock'] = all_shots.apply(combine_off_blocks, axis=1)\n", "all_shots['is_OffMadeFG'] = all_shots.apply(combine_off_makes, axis=1)\n", "all_shots['is_OffOreb'] = all_shots.apply(combine_off_orebs, axis=1)\n", "all_shots['is_OffMissedFG'] = all_shots.apply(combine_off_missed_fg, axis=1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | Season | \n", "Made | \n", "Period | \n", "StartScoreDifferential | \n", "ShotValue | \n", "Time | \n", "Putback | \n", "IsRegularSeason | \n", "SecondsSincePlayStarted | \n", "ShotDistance | \n", "ShotAngle | \n", "is_Off3BlockedOreb | \n", "is_Off3Oreb | \n", "is_OffArc3Block | \n", "is_OffArc3Make | \n", "is_OffArc3Miss | \n", "is_OffAtRimBlock | \n", "is_OffAtRimBlockedOreb | \n", "is_OffAtRimMake | \n", "is_OffAtRimMiss | \n", "is_OffAtRimOreb | \n", "is_OffCorner3Block | \n", "is_OffCorner3Make | \n", "is_OffCorner3Miss | \n", "is_OffDeadball | \n", "is_OffFTMake | \n", "is_OffFTMiss | \n", "is_OffFTOreb | \n", "is_OffLiveBallTurnover | \n", "is_OffLongMidRangeBlock | \n", "is_OffLongMidRangeBlockedOreb | \n", "is_OffLongMidRangeMake | \n", "is_OffLongMidRangeMiss | \n", "is_OffLongMidRangeOreb | \n", "is_OffShortMidRangeBlock | \n", "is_OffShortMidRangeBlockedOreb | \n", "is_OffShortMidRangeMake | \n", "is_OffShortMidRangeMiss | \n", "is_OffShortMidRangeOreb | \n", "is_OffTeamBlockedOreb | \n", "is_OffTeamOreb | \n", "is_OffTimeout | \n", "is_OffBlockedOreb | \n", "is_OffBlock | \n", "is_OffMadeFG | \n", "is_OffOreb | \n", "is_OffMissedFG | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "7 | \n", "False | \n", "4 | \n", "19 | \n", "2 | \n", "299.0 | \n", "False | \n", "False | \n", "23.0 | \n", "21.7 | \n", "42.2 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
1 | \n", "7 | \n", "True | \n", "4 | \n", "-19 | \n", "2 | \n", "281.0 | \n", "False | \n", "False | \n", "16.0 | \n", "0.0 | \n", "0.0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
2 | \n", "7 | \n", "True | \n", "4 | \n", "18 | \n", "2 | \n", "231.0 | \n", "False | \n", "False | \n", "16.0 | \n", "13.5 | \n", "64.0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "7 | \n", "False | \n", "4 | \n", "-20 | \n", "3 | \n", "195.0 | \n", "False | \n", "False | \n", "7.0 | \n", "24.7 | \n", "61.5 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "7 | \n", "False | \n", "4 | \n", "20 | \n", "2 | \n", "180.0 | \n", "False | \n", "False | \n", "14.0 | \n", "18.8 | \n", "74.9 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "