{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:26:50.542194Z", "iopub.status.busy": "2021-07-26T10:26:50.541490Z", "iopub.status.idle": "2021-07-26T10:32:22.198836Z", "shell.execute_reply": "2021-07-26T10:32:22.199761Z", "shell.execute_reply.started": "2021-07-26T10:20:24.368850Z" }, "papermill": { "duration": 331.690361, "end_time": "2021-07-26T10:32:22.200320", "exception": false, "start_time": "2021-07-26T10:26:50.509959", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "========== train_updated.csv load ==========\n", "********** this is rosters **********\n", "********** this is nextDayPlayerEngagement **********\n", "********** this is playerBoxScores **********\n", "CPU times: user 4min 38s, sys: 14.7 s, total: 4min 52s\n", "Wall time: 5min 31s\n" ] } ], "source": [ "%%time\n", "import numpy as np\n", "import pandas as pd\n", "from pathlib import Path\n", "from sklearn.metrics import mean_absolute_error\n", "from datetime import timedelta\n", "from functools import reduce\n", "from tqdm import tqdm\n", "import lightgbm as lgbm\n", "import mlb\n", "import os\n", "\n", "import gc\n", "\n", "\n", "BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')\n", "#train = pd.read_csv(BASE_DIR / 'train_updated.csv')\n", "if os.path.isfile(BASE_DIR / 'train_updated.csv'):\n", " train = pd.read_csv(BASE_DIR / 'train_updated.csv')\n", " print(10*'=','train_updated.csv','load',10*'=')\n", "else:\n", " train = pd.read_csv(BASE_DIR / 'train.csv')\n", " print(10*'=','train.csv','load',10*'=')\n", " \n", "null = np.nan\n", "true = True\n", "false = False\n", "\n", "for col in ['rosters','nextDayPlayerEngagement','playerBoxScores']:\n", " print(10*'*','this is',col,10*'*')\n", " if col == 'date': continue\n", "\n", " _index = train[col].notnull()\n", " train.loc[_index, col] = train.loc[_index, col].apply(lambda x: eval(x))\n", "\n", " outputs = []\n", " for index, date, record in train.loc[_index, ['date', col]].itertuples():\n", " _df = pd.DataFrame(record)\n", " _df['index'] = index\n", " _df['date'] = date\n", " outputs.append(_df)\n", "\n", " outputs = pd.concat(outputs).reset_index(drop=True)\n", "\n", " outputs.to_csv(f'{col}_train.csv', index=False)\n", " outputs.to_pickle(f'{col}_train.pkl')\n", "\n", " del outputs\n", " del train[col]\n", " gc.collect()\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:32:22.246845Z", "iopub.status.busy": "2021-07-26T10:32:22.246101Z", "iopub.status.idle": "2021-07-26T10:32:23.960190Z", "shell.execute_reply": "2021-07-26T10:32:23.959646Z", "shell.execute_reply.started": "2021-07-26T10:26:02.194504Z" }, "papermill": { "duration": 1.737149, "end_time": "2021-07-26T10:32:23.960338", "exception": false, "start_time": "2021-07-26T10:32:22.223189", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')\n", "TRAIN_DIR = Path('./')\n", "\n", "players = pd.read_csv(BASE_DIR / 'players.csv')\n", "\n", "rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')\n", "targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')\n", "scores = pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')\n", "scores = scores.groupby(['playerId', 'date']).sum().reset_index()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:32:24.013823Z", "iopub.status.busy": "2021-07-26T10:32:24.007884Z", "iopub.status.idle": "2021-07-26T10:32:24.017126Z", "shell.execute_reply": "2021-07-26T10:32:24.016396Z" }, "papermill": { "duration": 0.040202, "end_time": "2021-07-26T10:32:24.017280", "exception": false, "start_time": "2021-07-26T10:32:23.977078", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']\n", "players_cols = ['playerId', 'primaryPositionName','heightInches','weight']\n", "rosters_cols = ['playerId', 'teamId', 'status', 'date']\n", "scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',\n", " 'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',\n", " 'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',\n", " 'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',\n", " 'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',\n", " 'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',\n", " 'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',\n", " 'completeGamesPitching', 'shutoutsPitching', 'winsPitching',\n", " 'lossesPitching', 'flyOutsPitching', 'airOutsPitching',\n", " 'groundOutsPitching', 'runsPitching', 'doublesPitching',\n", " 'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',\n", " 'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',\n", " 'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',\n", " 'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',\n", " 'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',\n", " 'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',\n", " 'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',\n", " 'inheritedRunnersScored', 'catchersInterferencePitching',\n", " 'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',\n", " 'assists', 'putOuts', 'errors', 'chances', 'date']\n", "\n", "feature_cols = ['label_playerId', 'label_primaryPositionName', 'label_teamId',\n", " 'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',\n", " 'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',\n", " 'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',\n", " 'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',\n", " 'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',\n", " 'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',\n", " 'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',\n", " 'completeGamesPitching', 'shutoutsPitching', 'winsPitching',\n", " 'lossesPitching', 'flyOutsPitching', 'airOutsPitching',\n", " 'groundOutsPitching', 'runsPitching', 'doublesPitching',\n", " 'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',\n", " 'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',\n", " 'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',\n", " 'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',\n", " 'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',\n", " 'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',\n", " 'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',\n", " 'inheritedRunnersScored', 'catchersInterferencePitching',\n", " 'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',\n", " 'assists', 'putOuts', 'errors', 'chances','target1_mean',\n", " 'target1_median',\n", " 'target1_std',\n", " 'target1_min',\n", " 'target1_max',\n", " 'target1_prob',\n", " 'target2_mean',\n", " 'target2_median',\n", " 'target2_std',\n", " 'target2_min',\n", " 'target2_max',\n", " 'target2_prob',\n", " 'target3_mean',\n", " 'target3_median',\n", " 'target3_std',\n", " 'target3_min',\n", " 'target3_max',\n", " 'target3_prob',\n", " 'target4_mean',\n", " 'target4_median',\n", " 'target4_std',\n", " 'target4_min',\n", " 'target4_max',\n", " 'target4_prob']\n", "feature_cols2 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',\n", " 'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',\n", " 'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',\n", " 'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',\n", " 'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',\n", " 'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',\n", " 'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',\n", " 'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',\n", " 'completeGamesPitching', 'shutoutsPitching', 'winsPitching',\n", " 'lossesPitching', 'flyOutsPitching', 'airOutsPitching',\n", " 'groundOutsPitching', 'runsPitching', 'doublesPitching',\n", " 'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',\n", " 'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',\n", " 'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',\n", " 'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',\n", " 'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',\n", " 'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',\n", " 'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',\n", " 'inheritedRunnersScored', 'catchersInterferencePitching',\n", " 'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',\n", " 'assists', 'putOuts', 'errors', 'chances','target1_mean',\n", " 'target1_median',\n", " 'target1_std',\n", " 'target1_min',\n", " 'target1_max',\n", " 'target1_prob',\n", " 'target2_mean',\n", " 'target2_median',\n", " 'target2_std',\n", " 'target2_min',\n", " 'target2_max',\n", " 'target2_prob',\n", " 'target3_mean',\n", " 'target3_median',\n", " 'target3_std',\n", " 'target3_min',\n", " 'target3_max',\n", " 'target3_prob',\n", " 'target4_mean',\n", " 'target4_median',\n", " 'target4_std',\n", " 'target4_min',\n", " 'target4_max',\n", " 'target4_prob',\n", " 'target1']" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:32:24.057009Z", "iopub.status.busy": "2021-07-26T10:32:24.056180Z", "iopub.status.idle": "2021-07-26T10:32:24.136934Z", "shell.execute_reply": "2021-07-26T10:32:24.136353Z" }, "papermill": { "duration": 0.10236, "end_time": "2021-07-26T10:32:24.137088", "exception": false, "start_time": "2021-07-26T10:32:24.034728", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['playerId',\n", " 'target1_mean',\n", " 'target1_median',\n", " 'target1_std',\n", " 'target1_min',\n", " 'target1_max',\n", " 'target1_prob',\n", " 'target2_mean',\n", " 'target2_median',\n", " 'target2_std',\n", " 'target2_min',\n", " 'target2_max',\n", " 'target2_prob',\n", " 'target3_mean',\n", " 'target3_median',\n", " 'target3_std',\n", " 'target3_min',\n", " 'target3_max',\n", " 'target3_prob',\n", " 'target4_mean',\n", " 'target4_median',\n", " 'target4_std',\n", " 'target4_min',\n", " 'target4_max',\n", " 'target4_prob']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "player_target_stats = pd.read_csv(\"../input/my-player-target-stat/player_target_stats.csv\")\n", "data_names=player_target_stats.columns.values.tolist()\n", "data_names" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:32:24.183998Z", "iopub.status.busy": "2021-07-26T10:32:24.182305Z", "iopub.status.idle": "2021-07-26T10:32:33.730454Z", "shell.execute_reply": "2021-07-26T10:32:33.729892Z" }, "papermill": { "duration": 9.576279, "end_time": "2021-07-26T10:32:33.730615", "exception": false, "start_time": "2021-07-26T10:32:24.154336", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# creat dataset\n", "train = targets[targets_cols].merge(players[players_cols], on=['playerId'], how='left')\n", "train = train.merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')\n", "train = train.merge(scores[scores_cols], on=['playerId', 'date'], how='left')\n", "train = train.merge(player_target_stats, how='inner', left_on=[\"playerId\"],right_on=[\"playerId\"])\n", "\n", "\n", "# label encoding\n", "player2num = {c: i for i, c in enumerate(train['playerId'].unique())}\n", "position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}\n", "teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}\n", "status2num = {c: i for i, c in enumerate(train['status'].unique())}\n", "train['label_playerId'] = train['playerId'].map(player2num)\n", "train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)\n", "train['label_teamId'] = train['teamId'].map(teamid2num)\n", "train['label_status'] = train['status'].map(status2num)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:32:33.774093Z", "iopub.status.busy": "2021-07-26T10:32:33.773362Z", "iopub.status.idle": "2021-07-26T10:32:39.909449Z", "shell.execute_reply": "2021-07-26T10:32:39.908813Z" }, "papermill": { "duration": 6.161649, "end_time": "2021-07-26T10:32:39.909604", "exception": false, "start_time": "2021-07-26T10:32:33.747955", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_X = train[feature_cols]\n", "train_y = train[['target1', 'target2', 'target3', 'target4']]\n", "\n", "#_index = (train['date'] < 20210401)\n", "_index = ((train['date'] > 20200529) & (train['date'] <= 20200831)) | ((train['date'] > 20190529) & (train['date'] <= 20190831)) | ((train['date'] > 20180529) & (train['date'] <= 20180831))\n", "x_train1 = train_X.loc[~_index].reset_index(drop=True)\n", "y_train1 = train_y.loc[~_index].reset_index(drop=True)\n", "x_valid1 = train_X.loc[_index].reset_index(drop=True)\n", "y_valid1 = train_y.loc[_index].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:32:41.209923Z", "iopub.status.busy": "2021-07-26T10:32:41.208914Z", "iopub.status.idle": "2021-07-26T10:32:43.873607Z", "shell.execute_reply": "2021-07-26T10:32:43.873047Z" }, "papermill": { "duration": 3.945406, "end_time": "2021-07-26T10:32:43.873756", "exception": false, "start_time": "2021-07-26T10:32:39.928350", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "train_X = train[feature_cols2]\n", "train_y = train[['target1', 'target2', 'target3', 'target4']]\n", "\n", "#_index = (train['date'] < 20210401)\n", "x_train2 = train_X.loc[~_index].reset_index(drop=True)\n", "y_train2 = train_y.loc[~_index].reset_index(drop=True)\n", "x_valid2 = train_X.loc[_index].reset_index(drop=True)\n", "y_valid2 = train_y.loc[_index].reset_index(drop=True)" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.016809, "end_time": "2021-07-26T10:32:43.907896", "exception": false, "start_time": "2021-07-26T10:32:43.891087", "status": "completed" }, "tags": [] }, "source": [ "# LGB" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T10:32:43.951312Z", "iopub.status.busy": "2021-07-26T10:32:43.950568Z", "iopub.status.idle": "2021-07-26T11:14:16.906091Z", "shell.execute_reply": "2021-07-26T11:14:16.906631Z" }, "papermill": { "duration": 2492.981885, "end_time": "2021-07-26T11:14:16.906885", "exception": false, "start_time": "2021-07-26T10:32:43.925000", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[LightGBM] [Warning] feature_fraction is set=0.8101240539122566, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8101240539122566\n", "[LightGBM] [Warning] bagging_freq is set=8, subsample_freq=0 will be ignored. Current value: bagging_freq=8\n", "[LightGBM] [Warning] bagging_fraction is set=0.8884451442950513, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8884451442950513\n", "Training until validation scores don't improve for 100 rounds\n", "[100]\tvalid_0's l1: 0.603681\n", "[200]\tvalid_0's l1: 0.602494\n", "[300]\tvalid_0's l1: 0.602064\n", "[400]\tvalid_0's l1: 0.602072\n", "[500]\tvalid_0's l1: 0.601976\n", "[600]\tvalid_0's l1: 0.601959\n", "[700]\tvalid_0's l1: 0.601676\n", "Early stopping, best iteration is:\n", "[673]\tvalid_0's l1: 0.601666\n", "mae: 0.6016661242531858\n", "[LightGBM] [Warning] feature_fraction is set=0.9101240539122566, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.9101240539122566\n", "[LightGBM] [Warning] bagging_freq is set=3, subsample_freq=0 will be ignored. Current value: bagging_freq=3\n", "[LightGBM] [Warning] bagging_fraction is set=0.9884451442950513, subsample=1.0 will be ignored. Current value: bagging_fraction=0.9884451442950513\n", "Training until validation scores don't improve for 100 rounds\n", "[100]\tvalid_0's l1: 1.72755\n", "[200]\tvalid_0's l1: 1.71505\n", "[300]\tvalid_0's l1: 1.71076\n", "[400]\tvalid_0's l1: 1.70847\n", "[500]\tvalid_0's l1: 1.70694\n", "[600]\tvalid_0's l1: 1.70668\n", "[700]\tvalid_0's l1: 1.7052\n", "[800]\tvalid_0's l1: 1.70479\n", "[900]\tvalid_0's l1: 1.70432\n", "[1000]\tvalid_0's l1: 1.7037\n", "[1100]\tvalid_0's l1: 1.703\n", "[1200]\tvalid_0's l1: 1.70253\n", "[1300]\tvalid_0's l1: 1.70207\n", "[1400]\tvalid_0's l1: 1.702\n", "[1500]\tvalid_0's l1: 1.70185\n", "[1600]\tvalid_0's l1: 1.7011\n", "[1700]\tvalid_0's l1: 1.70071\n", "[1800]\tvalid_0's l1: 1.70073\n", "[1900]\tvalid_0's l1: 1.70063\n", "[2000]\tvalid_0's l1: 1.7004\n", "[2100]\tvalid_0's l1: 1.70023\n", "[2200]\tvalid_0's l1: 1.69993\n", "Early stopping, best iteration is:\n", "[2192]\tvalid_0's l1: 1.69993\n", "mae: 1.699927207171509\n", "[LightGBM] [Warning] bagging_fraction is set=0.5637405128936662, subsample=1.0 will be ignored. Current value: bagging_fraction=0.5637405128936662\n", "[LightGBM] [Warning] feature_fraction is set=0.5419185713426886, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5419185713426886\n", "[LightGBM] [Warning] bagging_freq is set=15, subsample_freq=0 will be ignored. Current value: bagging_freq=15\n", "Training until validation scores don't improve for 100 rounds\n", "[100]\tvalid_0's l1: 0.715969\n", "[200]\tvalid_0's l1: 0.714891\n", "[300]\tvalid_0's l1: 0.714887\n", "[400]\tvalid_0's l1: 0.714884\n", "[500]\tvalid_0's l1: 0.714883\n", "[600]\tvalid_0's l1: 0.714882\n", "[700]\tvalid_0's l1: 0.714879\n", "[800]\tvalid_0's l1: 0.714878\n", "[900]\tvalid_0's l1: 0.714714\n", "[1000]\tvalid_0's l1: 0.714713\n", "[1100]\tvalid_0's l1: 0.714712\n", "[1200]\tvalid_0's l1: 0.714711\n", "[1300]\tvalid_0's l1: 0.714711\n", "[1400]\tvalid_0's l1: 0.71471\n", "[1500]\tvalid_0's l1: 0.714709\n", "[1600]\tvalid_0's l1: 0.714709\n", "[1700]\tvalid_0's l1: 0.71455\n", "[1800]\tvalid_0's l1: 0.71455\n", "[1900]\tvalid_0's l1: 0.714549\n", "[2000]\tvalid_0's l1: 0.714549\n", "[2100]\tvalid_0's l1: 0.714548\n", "[2200]\tvalid_0's l1: 0.714548\n", "[2300]\tvalid_0's l1: 0.714547\n", "[2400]\tvalid_0's l1: 0.714547\n", "[2500]\tvalid_0's l1: 0.714546\n", "[2600]\tvalid_0's l1: 0.714546\n", "[2700]\tvalid_0's l1: 0.714545\n", "[2800]\tvalid_0's l1: 0.714545\n", "[2900]\tvalid_0's l1: 0.714544\n", "[3000]\tvalid_0's l1: 0.714543\n", "[3100]\tvalid_0's l1: 0.714543\n", "[3200]\tvalid_0's l1: 0.714543\n", "[3300]\tvalid_0's l1: 0.714542\n", "[3400]\tvalid_0's l1: 0.714542\n", "[3500]\tvalid_0's l1: 0.714543\n", "Early stopping, best iteration is:\n", "[3473]\tvalid_0's l1: 0.714542\n", "mae: 0.7145421805738932\n", "[LightGBM] [Warning] feature_fraction is set=0.5419185713426886, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5419185713426886\n", "[LightGBM] [Warning] bagging_freq is set=19, subsample_freq=0 will be ignored. Current value: bagging_freq=19\n", "[LightGBM] [Warning] bagging_fraction is set=0.2637405128936662, subsample=1.0 will be ignored. Current value: bagging_fraction=0.2637405128936662\n", "Training until validation scores don't improve for 100 rounds\n", "[100]\tvalid_0's l1: 0.82029\n", "[200]\tvalid_0's l1: 0.817858\n", "[300]\tvalid_0's l1: 0.816503\n", "[400]\tvalid_0's l1: 0.815647\n", "[500]\tvalid_0's l1: 0.815143\n", "[600]\tvalid_0's l1: 0.814731\n", "[700]\tvalid_0's l1: 0.814472\n", "[800]\tvalid_0's l1: 0.814144\n", "[900]\tvalid_0's l1: 0.813819\n", "[1000]\tvalid_0's l1: 0.813664\n", "[1100]\tvalid_0's l1: 0.813584\n", "Early stopping, best iteration is:\n", "[1053]\tvalid_0's l1: 0.813545\n", "mae: 0.8135446889692893\n", "score: 0.9574200502419693\n" ] } ], "source": [ "def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):\n", " oof_pred = np.zeros(len(y_valid), dtype=np.float32)\n", " model = lgbm.LGBMRegressor(**params)\n", " model.fit(x_train, y_train, \n", " eval_set=[(x_valid, y_valid)], \n", " early_stopping_rounds=verbose, \n", " verbose=verbose)\n", " oof_pred = model.predict(x_valid)\n", " score = mean_absolute_error(oof_pred, y_valid)\n", " print('mae:', score)\n", " return oof_pred, model, score\n", "\n", "\n", "params1 = {'objective':'mae',\n", " 'reg_alpha': 0.14547461820098767, \n", " 'reg_lambda': 0.10185644384043743, \n", " 'n_estimators': 3333, \n", " 'learning_rate': 0.1046301304430488, \n", " 'num_leaves': 674, \n", " 'feature_fraction': 0.8101240539122566, \n", " 'bagging_fraction': 0.8884451442950513, \n", " 'bagging_freq': 8, \n", " 'min_child_samples': 51}\n", "\n", "params2 = {\n", " 'objective':'mae',\n", " 'reg_alpha': 0.14947461820098767, \n", " 'reg_lambda': 0.10185644384043743, \n", " 'n_estimators': 3633, \n", " 'learning_rate': 0.08046301304430488, \n", " 'num_leaves': 64, \n", " 'feature_fraction': 0.9101240539122566, \n", " 'bagging_fraction': 0.9884451442950513, \n", " 'bagging_freq': 3, \n", " 'min_child_samples': 15\n", "}\n", "\n", "params4 = {'objective':'mae',\n", " 'reg_alpha': 0.016468100279441976, \n", " 'reg_lambda': 0.09128335764019105, \n", " 'n_estimators': 9868, \n", " 'learning_rate': 0.10528150510326864, \n", " 'num_leaves': 157, \n", " 'feature_fraction': 0.5419185713426886, \n", " 'bagging_fraction': 0.2637405128936662, \n", " 'bagging_freq': 19, \n", " 'min_child_samples': 71}\n", "\n", "\n", "params = {\n", " 'objective':'mae',\n", "# 'reg_alpha': 0.1,\n", "# 'reg_lambda': 0.1, \n", " 'n_estimators': 10000,\n", " 'learning_rate': 0.1,\n", " 'random_state': 2021,\n", " \"num_leaves\": 127,\n", " 'feature_fraction': 0.5419185713426886, \n", " 'bagging_fraction': 0.5637405128936662, \n", " 'bagging_freq': 15, \n", "}\n", "\n", "\n", "\n", "oof1, model1, score1 = fit_lgbm(\n", " x_train1, y_train1['target1'],\n", " x_valid1, y_valid1['target1'],\n", " params1\n", " )\n", "\n", "oof2, model2, score2 = fit_lgbm(\n", " x_train2, y_train2['target2'],\n", " x_valid2, y_valid2['target2'],\n", " params2\n", ")\n", "\n", "oof3, model3, score3 = fit_lgbm(\n", " x_train2, y_train2['target3'],\n", " x_valid2, y_valid2['target3'],\n", " params\n", ")\n", "\n", "oof4, model4, score4 = fit_lgbm(\n", " x_train2, y_train2['target4'],\n", " x_valid2, y_valid2['target4'],\n", " params4\n", ")\n", "\n", "score = (score1+score2+score3+score4) / 4\n", "print(f'score: {score}')" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.044368, "end_time": "2021-07-26T11:14:16.995082", "exception": false, "start_time": "2021-07-26T11:14:16.950714", "status": "completed" }, "tags": [] }, "source": [ "# Cat" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T11:14:17.101601Z", "iopub.status.busy": "2021-07-26T11:14:17.100913Z", "iopub.status.idle": "2021-07-26T11:29:55.386184Z", "shell.execute_reply": "2021-07-26T11:29:55.385613Z" }, "papermill": { "duration": 938.347381, "end_time": "2021-07-26T11:29:55.386423", "exception": false, "start_time": "2021-07-26T11:14:17.039042", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_2.pkl'> **********\n", "mae: 1.7495870137280762\n", "********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_2.pkl'> **********\n", "mae: 1.8092167805205939\n", "********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_1.pkl'> **********\n", "mae: 0.613446416791378\n", "********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_1.pkl'> **********\n", "mae: 0.6265251914020156\n", "********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_3.pkl'> **********\n", "mae: 0.7371395237381603\n", "********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_3.pkl'> **********\n", "mae: 0.7406471371942968\n", "********** <_io.BufferedReader name='../input/mlb-lightgbm-training/mymodel_lgb_4.pkl'> **********\n", "mae: 0.8178008821616221\n", "********** <_io.BufferedReader name='../input/mlb-catboost-training/mymodel_cb_4.pkl'> **********\n", "mae: 0.846635209540955\n", "LightGBM score: 0.9794934591048092\n", "Catboost score: 1.0057560796644653\n" ] } ], "source": [ "import pickle\n", "from catboost import CatBoostRegressor\n", "\n", "def fit_lgbm(x_train, y_train, x_valid, y_valid, target, params: dict=None, verbose=100):\n", " oof_pred_lgb = np.zeros(len(y_valid), dtype=np.float32)\n", " oof_pred_cat = np.zeros(len(y_valid), dtype=np.float32)\n", " \n", " if os.path.isfile(f'../input/mlb-lightgbm-training/mymodel_lgb_{target}.pkl'):\n", " with open(f'../input/mlb-lightgbm-training/mymodel_lgb_{target}.pkl', 'rb') as fin:\n", " model = pickle.load(fin)\n", " oof_pred_lgb = model.predict(x_valid)\n", " score_lgb = mean_absolute_error(oof_pred_lgb, y_valid)\n", " print('*'*10,fin,'*'*10)\n", " print('mae:', score_lgb)\n", " else:\n", " with open(f'mymodel_lgb_{target}.pkl', 'wb') as handle:\n", " pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", " \n", "\n", " \n", " if os.path.isfile(f'../input/mlb-catboost-training/mymodel_cb_{target}.pkl'):\n", " with open(f'../input/mlb-catboost-training/mymodel_cb_{target}.pkl', 'rb') as fin:\n", " model_cb = pickle.load(fin)\n", " oof_pred_cat = model_cb.predict(x_valid)\n", " score_cat = mean_absolute_error(oof_pred_cat, y_valid)\n", " print('*'*10,fin,'*'*10)\n", " print('mae:', score_cat)\n", " \n", " else:\n", "\n", " with open(f'model_cb_{target}.pkl', 'wb') as handle:\n", " pickle.dump(model_cb, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", " \n", "\n", " return oof_pred_lgb, model, oof_pred_cat, model_cb, score_lgb, score_cat\n", "\n", "\n", "params = {\n", "'boosting_type': 'gbdt',\n", "'objective':'mae',\n", "'subsample': 0.6,\n", "'subsample_freq': 1,\n", "'learning_rate': 0.03,\n", "'num_leaves': 2**11-1,\n", "'min_data_in_leaf': 2**12-1,\n", "'feature_fraction': 0.6,\n", "'max_bin': 100,\n", "'n_estimators': 2500,\n", "'boost_from_average': False,\n", "\"random_seed\":2021,\n", "}\n", "\n", "oof_pred_lgb2, model_lgb2, oof_pred_cat2, model_cb2, score_lgb2, score_cat2 = fit_lgbm(\n", " x_train1, y_train1['target2'],\n", " x_valid1, y_valid1['target2'],\n", " 2, params\n", ")\n", "\n", "oof_pred_lgb1, model_lgb1, oof_pred_cat1, model_cb1, score_lgb1, score_cat1 = fit_lgbm(\n", " x_train1, y_train1['target1'],\n", " x_valid1, y_valid1['target1'],\n", " 1, params\n", ")\n", "\n", "oof_pred_lgb3, model_lgb3, oof_pred_cat3, model_cb3, score_lgb3, score_cat3 = fit_lgbm(\n", " x_train1, y_train1['target3'],\n", " x_valid1, y_valid1['target3'],\n", " 3, params\n", ")\n", "oof_pred_lgb4, model_lgb4, oof_pred_cat4, model_cb4, score_lgb4, score_cat4= fit_lgbm(\n", " x_train1, y_train1['target4'],\n", " x_valid1, y_valid1['target4'],\n", " 4, params\n", ")\n", "\n", "score = (score_lgb1+score_lgb2+score_lgb3+score_lgb4) / 4\n", "print(f'LightGBM score: {score}')\n", "\n", "score = (score_cat1+score_cat2+score_cat3+score_cat4) / 4\n", "print(f'Catboost score: {score}')" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.054535, "end_time": "2021-07-26T11:29:55.501236", "exception": false, "start_time": "2021-07-26T11:29:55.446701", "status": "completed" }, "tags": [] }, "source": [ "# ANN" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T11:29:55.620861Z", "iopub.status.busy": "2021-07-26T11:29:55.620092Z", "iopub.status.idle": "2021-07-26T11:29:55.633123Z", "shell.execute_reply": "2021-07-26T11:29:55.633643Z" }, "papermill": { "duration": 0.066731, "end_time": "2021-07-26T11:29:55.633849", "exception": false, "start_time": "2021-07-26T11:29:55.567118", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "players_cols = ['playerId', 'primaryPositionName']\n", "rosters_cols = ['playerId', 'teamId', 'status']\n", "scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',\n", " 'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',\n", " 'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',\n", " 'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',\n", " 'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',\n", " 'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',\n", " 'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',\n", " 'completeGamesPitching', 'shutoutsPitching', 'winsPitching',\n", " 'lossesPitching', 'flyOutsPitching', 'airOutsPitching',\n", " 'groundOutsPitching', 'runsPitching', 'doublesPitching',\n", " 'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',\n", " 'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',\n", " 'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',\n", " 'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',\n", " 'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',\n", " 'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',\n", " 'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',\n", " 'inheritedRunnersScored', 'catchersInterferencePitching',\n", " 'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',\n", " 'assists', 'putOuts', 'errors', 'chances']\n", "\n", "null = np.nan\n", "true = True\n", "false = False" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T11:29:55.782230Z", "iopub.status.busy": "2021-07-26T11:29:55.769001Z", "iopub.status.idle": "2021-07-26T11:31:40.775987Z", "shell.execute_reply": "2021-07-26T11:31:40.776546Z" }, "papermill": { "duration": 105.090934, "end_time": "2021-07-26T11:31:40.776753", "exception": false, "start_time": "2021-07-26T11:29:55.685819", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(2506176, 6)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [01:08<00:00, 3.43s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "(2506176, 87)\n", "(2464956, 87)\n", "Model: \"ANN\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "inputs (InputLayer) [(None, 84)] 0 \n", "_________________________________________________________________\n", "d1 (Dense) (None, 50) 4250 \n", "_________________________________________________________________\n", "d2 (Dense) (None, 50) 2550 \n", "_________________________________________________________________\n", "preds (Dense) (None, 4) 204 \n", "=================================================================\n", "Total params: 7,004\n", "Trainable params: 7,004\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "None\n", "FOLD: 0\n", "10/10 [==============================] - 0s 17ms/step\n", "FOLD: 1\n", "10/10 [==============================] - 0s 14ms/step\n", "FOLD: 2\n", "10/10 [==============================] - 0s 15ms/step\n", "FOLD: 3\n", "10/10 [==============================] - 0s 14ms/step\n", "FOLD: 4\n", "10/10 [==============================] - 0s 16ms/step\n", "mae: 0.7727517316297969\n", "mse: 3.9314386784209567\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from datetime import timedelta\n", "from tqdm import tqdm\n", "import gc\n", "from functools import reduce\n", "from sklearn.model_selection import StratifiedKFold\n", "\n", "ROOT_DIR = \"../input/mlb-player-digital-engagement-forecasting\"\n", "\n", "#=======================#\n", "def flatten(df, col):\n", " du = (df.pivot(index=\"playerId\", columns=\"EvalDate\", \n", " values=col).add_prefix(f\"{col}_\").\n", " rename_axis(None, axis=1).reset_index())\n", " return du\n", "#============================#\n", "def reducer(left, right):\n", " return left.merge(right, on=\"playerId\")\n", "#========================\n", "\n", "TGTCOLS = [\"target1\",\"target2\",\"target3\",\"target4\"]\n", "def train_lag(df, lag=1):\n", " dp = df[[\"playerId\",\"EvalDate\"]+TGTCOLS].copy()\n", " dp[\"EvalDate\"] =dp[\"EvalDate\"] + timedelta(days=lag) \n", " df = df.merge(dp, on=[\"playerId\", \"EvalDate\"], suffixes=[\"\",f\"_{lag}\"], how=\"left\")\n", " return df\n", "#=================================\n", "def test_lag(sub):\n", " sub[\"playerId\"] = sub[\"date_playerId\"].apply(lambda s: int( s.split(\"_\")[1] ) )\n", " assert sub.date.nunique() == 1\n", " dte = sub[\"date\"].unique()[0]\n", " \n", " eval_dt = pd.to_datetime(dte, format=\"%Y%m%d\")\n", " dtes = [eval_dt + timedelta(days=-k) for k in LAGS]\n", " mp_dtes = {eval_dt + timedelta(days=-k):k for k in LAGS}\n", " \n", " sl = LAST.loc[LAST.EvalDate.between(dtes[-1], dtes[0]), [\"EvalDate\",\"playerId\"]+TGTCOLS].copy()\n", " sl[\"EvalDate\"] = sl[\"EvalDate\"].map(mp_dtes)\n", " du = [flatten(sl, col) for col in TGTCOLS]\n", " du = reduce(reducer, du)\n", " return du, eval_dt\n", " #\n", "#===============\n", "\n", "tr = pd.read_csv(\"../input/my-mlb-data/target.csv\")\n", "print(tr.shape)\n", "gc.collect()\n", "\n", "tr[\"EvalDate\"] = pd.to_datetime(tr[\"EvalDate\"])\n", "tr[\"EvalDate\"] = tr[\"EvalDate\"] + timedelta(days=-1)\n", "tr[\"EvalYear\"] = tr[\"EvalDate\"].dt.year\n", "\n", "MED_DF = tr.groupby([\"playerId\",\"EvalYear\"])[TGTCOLS].median().reset_index()\n", "MEDCOLS = [\"tgt1_med\",\"tgt2_med\", \"tgt3_med\", \"tgt4_med\"]\n", "MED_DF.columns = [\"playerId\",\"EvalYear\"] + MEDCOLS\n", "\n", "LAGS = list(range(1,21))\n", "FECOLS = [f\"{col}_{lag}\" for lag in reversed(LAGS) for col in TGTCOLS]\n", "\n", "for lag in tqdm(LAGS):\n", " tr = train_lag(tr, lag=lag)\n", " gc.collect()\n", "#===========\n", "tr = tr.sort_values(by=[\"playerId\", \"EvalDate\"])\n", "print(tr.shape)\n", "tr = tr.dropna()\n", "print(tr.shape)\n", "tr = tr.merge(MED_DF, on=[\"playerId\",\"EvalYear\"])\n", "gc.collect()\n", "\n", "X = tr[FECOLS+MEDCOLS].values\n", "y = tr[TGTCOLS].values\n", "cl = tr[\"playerId\"].values\n", "\n", "NFOLDS = 5\n", "skf = StratifiedKFold(n_splits=NFOLDS)\n", "folds = skf.split(X, cl)\n", "folds = list(folds)\n", "\n", "import tensorflow as tf\n", "import tensorflow.keras.layers as L\n", "import tensorflow.keras.models as M\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", "from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping\n", "\n", "tf.random.set_seed(2021)\n", "\n", "def make_model(n_in):\n", " inp = L.Input(name=\"inputs\", shape=(n_in,))\n", " x = L.Dense(50, activation=\"relu\", name=\"d1\")(inp)\n", " x = L.Dense(50, activation=\"relu\", name=\"d2\")(x)\n", " preds = L.Dense(4, activation=\"linear\", name=\"preds\")(x)\n", " \n", " model = M.Model(inp, preds, name=\"ANN\")\n", " model.compile(loss=\"mean_absolute_error\", optimizer=\"adam\")\n", " return model\n", "\n", "net = make_model(X.shape[1])\n", "print(net.summary())\n", "\n", "oof = np.zeros(y.shape)\n", "nets = []\n", "for idx in range(NFOLDS):\n", " print(\"FOLD:\", idx)\n", " tr_idx, val_idx = folds[idx]\n", " ckpt = ModelCheckpoint(f\"../input/mlb-ann-training/w{idx}.h5\", monitor='val_loss', verbose=1, save_best_only=True,mode='min')\n", " reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=3, min_lr=0.0001)\n", " es = EarlyStopping(monitor='val_loss', patience=5)\n", " reg = make_model(X.shape[1])\n", "# reg.fit(X[tr_idx], y[tr_idx], epochs=10, batch_size=30_000, validation_data=(X[val_idx], y[val_idx]),\n", "# verbose=1, callbacks=[ckpt, reduce_lr, es])\n", " reg.load_weights(f\"../input/mlb-ann-training/w{idx}.h5\")\n", " oof[val_idx] = reg.predict(X[val_idx], batch_size=50_000, verbose=1)\n", " nets.append(reg)\n", " gc.collect()\n", "\n", "mae = mean_absolute_error(y, oof)\n", "mse = mean_squared_error(y, oof, squared=False)\n", "print(\"mae:\", mae)\n", "print(\"mse:\", mse)\n", "\n", "# Historical information to use in prediction time\n", "bound_dt = pd.to_datetime(\"2021-01-01\")\n", "LAST = tr.loc[tr.EvalDate>bound_dt].copy()\n", "\n", "LAST_MED_DF = MED_DF.loc[MED_DF.EvalYear==2021].copy()\n", "LAST_MED_DF.drop(\"EvalYear\", axis=1, inplace=True)\n", "del tr\n", "\n", "#\"\"\"\n", "import mlb\n", "FE = []; SUB = [];" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.064615, "end_time": "2021-07-26T11:31:40.909832", "exception": false, "start_time": "2021-07-26T11:31:40.845217", "status": "completed" }, "tags": [] }, "source": [ "# Predict" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T11:31:41.068184Z", "iopub.status.busy": "2021-07-26T11:31:41.048554Z", "iopub.status.idle": "2021-07-26T11:32:04.799825Z", "shell.execute_reply": "2021-07-26T11:32:04.800318Z" }, "papermill": { "duration": 23.827166, "end_time": "2021-07-26T11:32:04.800508", "exception": false, "start_time": "2021-07-26T11:31:40.973342", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.\n" ] } ], "source": [ "import copy\n", "\n", "env = mlb.make_env() # initialize the environment\n", "iter_test = env.iter_test() # iterator which loops over each date in test set\n", "\n", "for (test_df, sample_prediction_df) in iter_test: # make predictions here\n", " \n", " sub = copy.deepcopy(sample_prediction_df.reset_index())\n", " sample_prediction_df = copy.deepcopy(sample_prediction_df.reset_index(drop=True))\n", " \n", " # LGBM summit\n", " # creat dataset\n", " sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\\\n", " .map(lambda x: int(x.split('_')[1]))\n", " # Dealing with missing values\n", " if test_df['rosters'].iloc[0] == test_df['rosters'].iloc[0]:\n", " test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))\n", " else:\n", " test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})\n", " for col in rosters.columns:\n", " if col == 'playerId': continue\n", " test_rosters[col] = np.nan\n", " \n", " if test_df['playerBoxScores'].iloc[0] == test_df['playerBoxScores'].iloc[0]:\n", " test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))\n", " else:\n", " test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})\n", " for col in scores.columns:\n", " if col == 'playerId': continue\n", " test_scores[col] = np.nan\n", " test_scores = test_scores.groupby('playerId').sum().reset_index()\n", " test = sample_prediction_df[['playerId']].copy()\n", " test = test.merge(players[players_cols], on='playerId', how='left')\n", " test = test.merge(test_rosters[rosters_cols], on='playerId', how='left')\n", " test = test.merge(test_scores[scores_cols], on='playerId', how='left')\n", " test = test.merge(player_target_stats, how='inner', left_on=[\"playerId\"],right_on=[\"playerId\"])\n", " \n", "\n", " test['label_playerId'] = test['playerId'].map(player2num)\n", " test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)\n", " test['label_teamId'] = test['teamId'].map(teamid2num)\n", " test['label_status'] = test['status'].map(status2num)\n", " \n", " test_X = test[feature_cols]\n", " # predict\n", " pred1 = model1.predict(test_X)\n", " \n", " # predict\n", " pred_lgd1 = model_lgb1.predict(test_X)\n", " pred_lgd2 = model_lgb2.predict(test_X)\n", " pred_lgd3 = model_lgb3.predict(test_X)\n", " pred_lgd4 = model_lgb4.predict(test_X)\n", " \n", " pred_cat1 = model_cb1.predict(test_X)\n", " pred_cat2 = model_cb2.predict(test_X)\n", " pred_cat3 = model_cb3.predict(test_X)\n", " pred_cat4 = model_cb4.predict(test_X)\n", " \n", " test['target1'] = np.clip(pred1,0,100)\n", " test_X = test[feature_cols2]\n", "\n", " pred2 = model2.predict(test_X)\n", " pred3 = model3.predict(test_X)\n", " pred4 = model4.predict(test_X)\n", " \n", " # merge submission\n", " sample_prediction_df['target1'] = 1.00*np.clip(pred1, 0, 100)+0.00*np.clip(pred_lgd1, 0, 100)+0.00*np.clip(pred_cat1, 0, 100)\n", " sample_prediction_df['target2'] = 0.05*np.clip(pred2, 0, 100)+0.54*np.clip(pred_lgd2, 0, 100)+0.405*np.clip(pred_cat2, 0, 100)\n", " sample_prediction_df['target3'] = 0.76*np.clip(pred3, 0, 100)+0.14*np.clip(pred_lgd3, 0, 100)+0.10*np.clip(pred_cat3, 0, 100)\n", " sample_prediction_df['target4'] = 0.77*np.clip(pred4, 0, 100)+0.13*np.clip(pred_lgd4, 0, 100)+0.10*np.clip(pred_cat4, 0, 100)\n", " sample_prediction_df = sample_prediction_df.fillna(0.)\n", " del sample_prediction_df['playerId']\n", " # TF summit\n", " # Features computation at Evaluation Date\n", " sub_fe, eval_dt = test_lag(sub)\n", " sub_fe = sub_fe.merge(LAST_MED_DF, on=\"playerId\", how=\"left\")\n", " sub_fe = sub_fe.fillna(0.)\n", " \n", " _preds = 0.\n", " for reg in nets:\n", " _preds += reg.predict(sub_fe[FECOLS + MEDCOLS]) / NFOLDS\n", " sub_fe[TGTCOLS] = np.clip(_preds, 0, 100)\n", " sub.drop([\"date\"]+TGTCOLS, axis=1, inplace=True)\n", " sub = sub.merge(sub_fe[[\"playerId\"]+TGTCOLS], on=\"playerId\", how=\"left\")\n", " sub.drop(\"playerId\", axis=1, inplace=True)\n", " sub = sub.fillna(0.)\n", " # Blending\n", " blend = pd.concat(\n", " [sub[['date_playerId']],\n", " (0.22*sub.drop('date_playerId', axis=1) + 0.78*sample_prediction_df.drop('date_playerId', axis=1))],\n", " axis=1\n", " )\n", " env.predict(blend)\n", " # Update Available information\n", " sub_fe[\"EvalDate\"] = eval_dt\n", " #sub_fe.drop(MEDCOLS, axis=1, inplace=True)\n", " LAST = LAST.append(sub_fe)\n", " LAST = LAST.drop_duplicates(subset=[\"EvalDate\",\"playerId\"], keep=\"last\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T11:32:04.944978Z", "iopub.status.busy": "2021-07-26T11:32:04.944303Z", "iopub.status.idle": "2021-07-26T11:32:04.957672Z", "shell.execute_reply": "2021-07-26T11:32:04.958218Z" }, "papermill": { "duration": 0.094386, "end_time": "2021-07-26T11:32:04.958407", "exception": false, "start_time": "2021-07-26T11:32:04.864021", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
date_playerIdtarget1target2target3target4
020210501_4887261.4178335.6505576.859939e-021.995048
120210501_6052180.0035360.3963551.702673e-030.868064
220210501_6215630.0992562.3868647.695223e-020.771513
320210501_6700840.0223010.8786446.095328e-040.277289
420210501_6709700.0103070.2510982.952593e-020.118300
..................
118220210501_5960490.0002760.0096442.608818e-120.035421
118320210501_6428510.0001760.0413581.818041e-070.079906
118420210501_5960710.0004510.0836871.183390e-040.070381
118520210501_6649010.0033080.3090833.068393e-020.199449
118620210501_6055250.0026550.5558474.771943e-040.116648
\n", "

1187 rows × 5 columns

\n", "
" ], "text/plain": [ " date_playerId target1 target2 target3 target4\n", "0 20210501_488726 1.417833 5.650557 6.859939e-02 1.995048\n", "1 20210501_605218 0.003536 0.396355 1.702673e-03 0.868064\n", "2 20210501_621563 0.099256 2.386864 7.695223e-02 0.771513\n", "3 20210501_670084 0.022301 0.878644 6.095328e-04 0.277289\n", "4 20210501_670970 0.010307 0.251098 2.952593e-02 0.118300\n", "... ... ... ... ... ...\n", "1182 20210501_596049 0.000276 0.009644 2.608818e-12 0.035421\n", "1183 20210501_642851 0.000176 0.041358 1.818041e-07 0.079906\n", "1184 20210501_596071 0.000451 0.083687 1.183390e-04 0.070381\n", "1185 20210501_664901 0.003308 0.309083 3.068393e-02 0.199449\n", "1186 20210501_605525 0.002655 0.555847 4.771943e-04 0.116648\n", "\n", "[1187 rows x 5 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(\n", " [sub[['date_playerId']],\n", " (sub.drop('date_playerId', axis=1) + sample_prediction_df.drop('date_playerId', axis=1)) / 2],\n", " axis=1\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2021-07-26T11:32:05.091936Z", "iopub.status.busy": "2021-07-26T11:32:05.090877Z", "iopub.status.idle": "2021-07-26T11:32:05.108982Z", "shell.execute_reply": "2021-07-26T11:32:05.108454Z" }, "papermill": { "duration": 0.086599, "end_time": "2021-07-26T11:32:05.109124", "exception": false, "start_time": "2021-07-26T11:32:05.022525", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
date_playerIdtarget1target2target3target4
020210501_4887262.728486e+008.0225091.070538e-012.425154
120210501_6052182.152436e-030.5665613.182109e-030.828016
220210501_6215631.422361e-012.7632331.348766e-020.880154
320210501_6700842.031692e-030.6473631.219066e-030.119984
420210501_6709705.941349e-040.1629239.765327e-030.048072
..................
118220210501_5960499.109051e-150.0168955.217635e-120.032081
118320210501_6428510.000000e+000.0725133.636082e-070.081339
118420210501_5960711.820810e-040.1094631.856178e-050.085482
118520210501_6649016.615386e-030.3588203.113339e-030.202974
118620210501_6055258.827960e-040.5934314.494436e-100.107011
\n", "

1187 rows × 5 columns

\n", "
" ], "text/plain": [ " date_playerId target1 target2 target3 target4\n", "0 20210501_488726 2.728486e+00 8.022509 1.070538e-01 2.425154\n", "1 20210501_605218 2.152436e-03 0.566561 3.182109e-03 0.828016\n", "2 20210501_621563 1.422361e-01 2.763233 1.348766e-02 0.880154\n", "3 20210501_670084 2.031692e-03 0.647363 1.219066e-03 0.119984\n", "4 20210501_670970 5.941349e-04 0.162923 9.765327e-03 0.048072\n", "... ... ... ... ... ...\n", "1182 20210501_596049 9.109051e-15 0.016895 5.217635e-12 0.032081\n", "1183 20210501_642851 0.000000e+00 0.072513 3.636082e-07 0.081339\n", "1184 20210501_596071 1.820810e-04 0.109463 1.856178e-05 0.085482\n", "1185 20210501_664901 6.615386e-03 0.358820 3.113339e-03 0.202974\n", "1186 20210501_605525 8.827960e-04 0.593431 4.494436e-10 0.107011\n", "\n", "[1187 rows x 5 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sample_prediction_df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "papermill": { "default_parameters": {}, "duration": 3924.98772, "end_time": "2021-07-26T11:32:07.079842", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2021-07-26T10:26:42.092122", "version": "2.3.3" } }, "nbformat": 4, "nbformat_minor": 5 }