{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import math\n", "import sys\n", "pd.set_option(\"display.max_columns\",50)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# download data from here: https://s3.amazonaws.com/pbpstats/db_dumps/raw_shots.csv.zip and put it in working directory\n", "all_shots = pd.read_csv('raw_shots.csv', dtype={'GameId': str})" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "all_shots = all_shots[~(all_shots.X.isna())]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# fill in missing putback column with False\n", "all_shots.Putback.fillna(False, inplace=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def is_regular_season(row):\n", " if row['GameId'][2] == '2':\n", " return True\n", " return False\n", "\n", "all_shots['IsRegularSeason'] = all_shots.apply(is_regular_season, axis=1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def get_season(row):\n", " # 2017-18 season will be 17, 2016-17 will be 16...\n", " # 2017-18 season for gleague will be 217\n", " # 2018 season for wnba will be 118\n", " return int(row['GameId'][0] + row['GameId'][3:5])\n", "\n", "all_shots['Season'] = all_shots.apply(get_season, axis=1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# add play start type - start type for possessions with no oreb, missed shot type for shots after orebs\n", "def get_play_start_type(row):\n", " if row['OrebShotType'] is np.nan:\n", " return row['StartType']\n", " elif row['OrebShotType'] in ['Arc3Blocked', 'Corner3Blocked']:\n", " # combine off blocked 3 orebs due to sample size issues\n", " return 'Off3BlockedOreb'\n", " elif row['OrebShotType'] in ['Corner3', 'Arc3']:\n", " # combine off 3 orebs because model does better when they are combined\n", " return 'Off3Oreb'\n", " else:\n", " return 'Off' + row['OrebShotType'] + 'Oreb'\n", "\n", "all_shots['PlayStartType'] = all_shots.apply(get_play_start_type, axis=1) " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_time_since_play_started(row):\n", " if row['OrebShotType'] is np.nan:\n", " return row['StartTime'] - row['Time']\n", " else:\n", " # use time since oreb if off oreb\n", " return row['SecondsSinceOReb']\n", "\n", "all_shots['SecondsSincePlayStarted'] = all_shots.apply(get_time_since_play_started, axis=1) " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def get_shot_distance(row):\n", " x_squared = row['X'] ** 2\n", " y_squared = row['Y'] ** 2\n", " shot_distance = math.sqrt(x_squared + y_squared) / 10 # unit for distance is off by factor of 10, divide by 10 to convert to feet\n", " return round(shot_distance, 1)\n", "\n", "all_shots['ShotDistance'] = all_shots.apply(get_shot_distance, axis=1) " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def get_shot_angle_from_centre(row):\n", " # 0 is straightaway, 90 is from the corners\n", " angle = abs(math.degrees(math.atan2(row['X'], row['Y'])))\n", " if angle < 90:\n", " return round(angle, 1)\n", " else:\n", " return round(180 - angle, 1)\n", "\n", "all_shots['ShotAngle'] = all_shots.apply(get_shot_angle_from_centre, axis=1) " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def make_all_ot_periods_5(row):\n", " # small sample size with Period > 5, make all OT periods 5\n", " if row['Period'] > 5:\n", " return 5\n", " else:\n", " return row['Period']\n", "\n", "all_shots['Period'] = all_shots.apply(make_all_ot_periods_5, axis=1)\n", "\n", "base_features = ['Season', 'Made', 'Period', 'StartScoreDifferential', 'ShotValue', 'Time', 'Putback', 'IsRegularSeason', 'PlayStartType', 'SecondsSincePlayStarted', 'ShotDistance', 'ShotAngle']\n", "all_shots = pd.get_dummies(all_shots[base_features], prefix='is')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# group together start types to see what level of detail on start type does better\n", "\n", "def combine_off_block_orebs(row):\n", " if row['is_Off3BlockedOreb'] + row['is_OffAtRimBlockedOreb'] + row['is_OffLongMidRangeBlockedOreb'] + row['is_OffShortMidRangeBlockedOreb'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "def combine_off_blocks(row):\n", " if row['is_OffArc3Block'] + row['is_OffCorner3Block'] + row['is_OffAtRimBlock'] + row['is_OffLongMidRangeBlock'] + row['is_OffShortMidRangeBlock'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "def combine_off_makes(row):\n", " if row['is_OffArc3Make'] + row['is_OffCorner3Make'] + row['is_OffAtRimMake'] + row['is_OffLongMidRangeMake'] + row['is_OffShortMidRangeMake'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "\n", "def combine_off_orebs(row):\n", " if row['is_Off3Oreb'] + row['is_OffAtRimOreb'] + row['is_OffLongMidRangeOreb'] + row['is_OffShortMidRangeOreb'] == 1:\n", " return 1\n", " else:\n", " return 0\n", "\n", "def combine_off_missed_fg(row):\n", " if row['is_OffArc3Miss'] + row['is_OffCorner3Miss'] + row['is_OffAtRimMiss'] + row['is_OffLongMidRangeMiss'] + row['is_OffShortMidRangeMiss'] == 1:\n", " return 1\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "all_shots['is_OffBlockedOreb'] = all_shots.apply(combine_off_block_orebs, axis=1)\n", "all_shots['is_OffBlock'] = all_shots.apply(combine_off_blocks, axis=1)\n", "all_shots['is_OffMadeFG'] = all_shots.apply(combine_off_makes, axis=1)\n", "all_shots['is_OffOreb'] = all_shots.apply(combine_off_orebs, axis=1)\n", "all_shots['is_OffMissedFG'] = all_shots.apply(combine_off_missed_fg, axis=1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonMadePeriodStartScoreDifferentialShotValueTimePutbackIsRegularSeasonSecondsSincePlayStartedShotDistanceShotAngleis_Off3BlockedOrebis_Off3Orebis_OffArc3Blockis_OffArc3Makeis_OffArc3Missis_OffAtRimBlockis_OffAtRimBlockedOrebis_OffAtRimMakeis_OffAtRimMissis_OffAtRimOrebis_OffCorner3Blockis_OffCorner3Makeis_OffCorner3Missis_OffDeadballis_OffFTMakeis_OffFTMissis_OffFTOrebis_OffLiveBallTurnoveris_OffLongMidRangeBlockis_OffLongMidRangeBlockedOrebis_OffLongMidRangeMakeis_OffLongMidRangeMissis_OffLongMidRangeOrebis_OffShortMidRangeBlockis_OffShortMidRangeBlockedOrebis_OffShortMidRangeMakeis_OffShortMidRangeMissis_OffShortMidRangeOrebis_OffTeamBlockedOrebis_OffTeamOrebis_OffTimeoutis_OffBlockedOrebis_OffBlockis_OffMadeFGis_OffOrebis_OffMissedFG
07False4192299.0FalseFalse23.021.742.2000000010000000000000000000000000100
17True4-192281.0FalseFalse16.00.00.0000000000000000000000100000000000001
27True4182231.0FalseFalse16.013.564.0000000000000000001000000000000000000
37False4-203195.0FalseFalse7.024.761.5000000000000000001000000000000000000
47False4202180.0FalseFalse14.018.874.9000010000000000000000000000000000001
\n", "
" ], "text/plain": [ " Season Made Period StartScoreDifferential ShotValue Time Putback \\\n", "0 7 False 4 19 2 299.0 False \n", "1 7 True 4 -19 2 281.0 False \n", "2 7 True 4 18 2 231.0 False \n", "3 7 False 4 -20 3 195.0 False \n", "4 7 False 4 20 2 180.0 False \n", "\n", " IsRegularSeason SecondsSincePlayStarted ShotDistance ShotAngle \\\n", "0 False 23.0 21.7 42.2 \n", "1 False 16.0 0.0 0.0 \n", "2 False 16.0 13.5 64.0 \n", "3 False 7.0 24.7 61.5 \n", "4 False 14.0 18.8 74.9 \n", "\n", " is_Off3BlockedOreb is_Off3Oreb is_OffArc3Block is_OffArc3Make \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "\n", " is_OffArc3Miss is_OffAtRimBlock is_OffAtRimBlockedOreb is_OffAtRimMake \\\n", "0 0 0 0 1 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 1 0 0 0 \n", "\n", " is_OffAtRimMiss is_OffAtRimOreb is_OffCorner3Block is_OffCorner3Make \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "\n", " is_OffCorner3Miss is_OffDeadball is_OffFTMake is_OffFTMiss \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "\n", " is_OffFTOreb is_OffLiveBallTurnover is_OffLongMidRangeBlock \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 1 0 \n", "3 0 1 0 \n", "4 0 0 0 \n", "\n", " is_OffLongMidRangeBlockedOreb is_OffLongMidRangeMake \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " is_OffLongMidRangeMiss is_OffLongMidRangeOreb is_OffShortMidRangeBlock \\\n", "0 0 0 0 \n", "1 1 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " is_OffShortMidRangeBlockedOreb is_OffShortMidRangeMake \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " is_OffShortMidRangeMiss is_OffShortMidRangeOreb is_OffTeamBlockedOreb \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " is_OffTeamOreb is_OffTimeout is_OffBlockedOreb is_OffBlock \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "\n", " is_OffMadeFG is_OffOreb is_OffMissedFG \n", "0 1 0 0 \n", "1 0 0 1 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 1 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_shots.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "all_shots.to_csv('data_for_model.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.23.3\n" ] } ], "source": [ "print(pd.__version__)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.14.5\n" ] } ], "source": [ "print(np.__version__)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sys.version_info(major=3, minor=6, micro=1, releaselevel='final', serial=0)\n" ] } ], "source": [ "print(sys.version_info)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 }