{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Expected Goals Model\n",
"\n",
"Load in pickled preprocessed data, separate into train and test split and train Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:17:37.364283Z",
"start_time": "2021-01-03T11:17:34.480145Z"
}
},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import json\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import FCPython\n",
"\n",
"import pickle\n",
"import statsmodels.api as sm\n",
"import statsmodels.formula.api as smf\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import Metrica_Functions_TLMAnalytics as mfun"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preprocess StatsBomb Event Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Events"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:18:25.439347Z",
"start_time": "2021-01-03T11:18:24.009193Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[16, 43, 11, 2]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get competition ids for relevant competitions\n",
"comps = pd.read_json('open-data-master/data/competitions.json')\n",
"male_comps = comps[comps['competition_gender'] == 'male']\n",
"male_comps_id = list(male_comps['competition_id'].unique())\n",
"male_comps_id"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:18:30.120116Z",
"start_time": "2021-01-03T11:18:29.265432Z"
}
},
"outputs": [],
"source": [
"# For those competitions, identify the match ids\n",
"male_df_list = []\n",
"for comp in male_comps_id:\n",
" json_files = [x for x in os.listdir('open-data-master/data/matches/' + str(comp)) if 'json' in x]\n",
" for event_json in json_files:\n",
" male_df = pd.read_json('open-data-master/data/matches/' + str(comp) + '/' + event_json)\n",
" male_df_list.append(male_df)\n",
"\n",
"male_df = pd.concat(male_df_list)\n",
"male_matches = list(male_df['match_id'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:21:42.967968Z",
"start_time": "2021-01-03T11:18:41.920269Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading: 15946. 1 / 596\n",
"Loading: 15956. 2 / 596\n",
"Loading: 15973. 3 / 596\n",
"Loading: 15978. 4 / 596\n",
"Loading: 15986. 5 / 596\n",
"Loading: 15998. 6 / 596\n",
"Loading: 16010. 7 / 596\n",
"Loading: 16023. 8 / 596\n",
"Loading: 16029. 9 / 596\n",
"Loading: 16056. 10 / 596\n",
"Loading: 16073. 11 / 596\n",
"Loading: 16079. 12 / 596\n",
"Loading: 16086. 13 / 596\n",
"Loading: 16095. 14 / 596\n",
"Loading: 16109. 15 / 596\n",
"Loading: 16120. 16 / 596\n",
"Loading: 16131. 17 / 596\n",
"Loading: 16136. 18 / 596\n",
"Loading: 16149. 19 / 596\n",
"Loading: 16157. 20 / 596\n",
"Loading: 16173. 21 / 596\n",
"Loading: 16182. 22 / 596\n",
"Loading: 16190. 23 / 596\n",
"Loading: 16196. 24 / 596\n",
"Loading: 16205. 25 / 596\n",
"Loading: 16215. 26 / 596\n",
"Loading: 16231. 27 / 596\n",
"Loading: 16240. 28 / 596\n",
"Loading: 16248. 29 / 596\n",
"Loading: 16265. 30 / 596\n",
"Loading: 16275. 31 / 596\n",
"Loading: 16289. 32 / 596\n",
"Loading: 16306. 33 / 596\n",
"Loading: 16317. 34 / 596\n",
"Loading: 18235. 35 / 596\n",
"Loading: 18236. 36 / 596\n",
"Loading: 18237. 37 / 596\n",
"Loading: 18240. 38 / 596\n",
"Loading: 18241. 39 / 596\n",
"Loading: 18242. 40 / 596\n",
"Loading: 18243. 41 / 596\n",
"Loading: 18244. 42 / 596\n",
"Loading: 18245. 43 / 596\n",
"Loading: 22912. 44 / 596\n",
"Loading: 2302764. 45 / 596\n",
"Loading: 265830. 46 / 596\n",
"Loading: 265835. 47 / 596\n",
"Loading: 265837. 48 / 596\n",
"Loading: 265839. 49 / 596\n",
"Loading: 265857. 50 / 596\n",
"Loading: 265866. 51 / 596\n",
"Loading: 265894. 52 / 596\n",
"Loading: 265896. 53 / 596\n",
"Loading: 265918. 54 / 596\n",
"Loading: 265944. 55 / 596\n",
"Loading: 265952. 56 / 596\n",
"Loading: 265958. 57 / 596\n",
"Loading: 265963. 58 / 596\n",
"Loading: 266015. 59 / 596\n",
"Loading: 266033. 60 / 596\n",
"Loading: 266045. 61 / 596\n",
"Loading: 266056. 62 / 596\n",
"Loading: 266066. 63 / 596\n",
"Loading: 266074. 64 / 596\n",
"Loading: 266106. 65 / 596\n",
"Loading: 266117. 66 / 596\n",
"Loading: 266142. 67 / 596\n",
"Loading: 266148. 68 / 596\n",
"Loading: 266149. 69 / 596\n",
"Loading: 266160. 70 / 596\n",
"Loading: 266166. 71 / 596\n",
"Loading: 266191. 72 / 596\n",
"Loading: 266201. 73 / 596\n",
"Loading: 266230. 74 / 596\n",
"Loading: 266236. 75 / 596\n",
"Loading: 266240. 76 / 596\n",
"Loading: 266254. 77 / 596\n",
"Loading: 266256. 78 / 596\n",
"Loading: 266273. 79 / 596\n",
"Loading: 266274. 80 / 596\n",
"Loading: 266280. 81 / 596\n",
"Loading: 266299. 82 / 596\n",
"Loading: 266310. 83 / 596\n",
"Loading: 266320. 84 / 596\n",
"Loading: 266357. 85 / 596\n",
"Loading: 266406. 86 / 596\n",
"Loading: 266420. 87 / 596\n",
"Loading: 266424. 88 / 596\n",
"Loading: 266433. 89 / 596\n",
"Loading: 266440. 90 / 596\n",
"Loading: 266462. 91 / 596\n",
"Loading: 266467. 92 / 596\n",
"Loading: 266477. 93 / 596\n",
"Loading: 266490. 94 / 596\n",
"Loading: 266491. 95 / 596\n",
"Loading: 266498. 96 / 596\n",
"Loading: 266516. 97 / 596\n",
"Loading: 266525. 98 / 596\n",
"Loading: 266528. 99 / 596\n",
"Loading: 266531. 100 / 596\n",
"Loading: 266557. 101 / 596\n",
"Loading: 266560. 102 / 596\n",
"Loading: 266603. 103 / 596\n",
"Loading: 266613. 104 / 596\n",
"Loading: 266620. 105 / 596\n",
"Loading: 266631. 106 / 596\n",
"Loading: 266653. 107 / 596\n",
"Loading: 266664. 108 / 596\n",
"Loading: 266669. 109 / 596\n",
"Loading: 266670. 110 / 596\n",
"Loading: 266724. 111 / 596\n",
"Loading: 266731. 112 / 596\n",
"Loading: 266741. 113 / 596\n",
"Loading: 266770. 114 / 596\n",
"Loading: 266794. 115 / 596\n",
"Loading: 266815. 116 / 596\n",
"Loading: 266827. 117 / 596\n",
"Loading: 266838. 118 / 596\n",
"Loading: 266846. 119 / 596\n",
"Loading: 266871. 120 / 596\n",
"Loading: 266874. 121 / 596\n",
"Loading: 266883. 122 / 596\n",
"Loading: 266885. 123 / 596\n",
"Loading: 266892. 124 / 596\n",
"Loading: 266916. 125 / 596\n",
"Loading: 266921. 126 / 596\n",
"Loading: 266929. 127 / 596\n",
"Loading: 266952. 128 / 596\n",
"Loading: 266961. 129 / 596\n",
"Loading: 266967. 130 / 596\n",
"Loading: 266986. 131 / 596\n",
"Loading: 266989. 132 / 596\n",
"Loading: 267039. 133 / 596\n",
"Loading: 267058. 134 / 596\n",
"Loading: 267076. 135 / 596\n",
"Loading: 267077. 136 / 596\n",
"Loading: 267085. 137 / 596\n",
"Loading: 267101. 138 / 596\n",
"Loading: 267138. 139 / 596\n",
"Loading: 267183. 140 / 596\n",
"Loading: 267192. 141 / 596\n",
"Loading: 267197. 142 / 596\n",
"Loading: 267212. 143 / 596\n",
"Loading: 267220. 144 / 596\n",
"Loading: 267273. 145 / 596\n",
"Loading: 267274. 146 / 596\n",
"Loading: 267301. 147 / 596\n",
"Loading: 267327. 148 / 596\n",
"Loading: 267343. 149 / 596\n",
"Loading: 267368. 150 / 596\n",
"Loading: 267373. 151 / 596\n",
"Loading: 267395. 152 / 596\n",
"Loading: 267400. 153 / 596\n",
"Loading: 267422. 154 / 596\n",
"Loading: 267432. 155 / 596\n",
"Loading: 267464. 156 / 596\n",
"Loading: 267492. 157 / 596\n",
"Loading: 267499. 158 / 596\n",
"Loading: 267502. 159 / 596\n",
"Loading: 267506. 160 / 596\n",
"Loading: 267520. 161 / 596\n",
"Loading: 267533. 162 / 596\n",
"Loading: 267561. 163 / 596\n",
"Loading: 267567. 164 / 596\n",
"Loading: 267569. 165 / 596\n",
"Loading: 267576. 166 / 596\n",
"Loading: 267590. 167 / 596\n",
"Loading: 267596. 168 / 596\n",
"Loading: 267597. 169 / 596\n",
"Loading: 267611. 170 / 596\n",
"Loading: 267660. 171 / 596\n",
"Loading: 267670. 172 / 596\n",
"Loading: 267675. 173 / 596\n",
"Loading: 303377. 174 / 596\n",
"Loading: 303400. 175 / 596\n",
"Loading: 303421. 176 / 596\n",
"Loading: 303430. 177 / 596\n",
"Loading: 303451. 178 / 596\n",
"Loading: 303470. 179 / 596\n",
"Loading: 303473. 180 / 596\n",
"Loading: 303479. 181 / 596\n",
"Loading: 303487. 182 / 596\n",
"Loading: 303493. 183 / 596\n",
"Loading: 303504. 184 / 596\n",
"Loading: 303516. 185 / 596\n",
"Loading: 303517. 186 / 596\n",
"Loading: 303524. 187 / 596\n",
"Loading: 303532. 188 / 596\n",
"Loading: 303548. 189 / 596\n",
"Loading: 303596. 190 / 596\n",
"Loading: 303600. 191 / 596\n",
"Loading: 303610. 192 / 596\n",
"Loading: 303615. 193 / 596\n",
"Loading: 303634. 194 / 596\n",
"Loading: 303652. 195 / 596\n",
"Loading: 303664. 196 / 596\n",
"Loading: 303666. 197 / 596\n",
"Loading: 303674. 198 / 596\n",
"Loading: 303680. 199 / 596\n",
"Loading: 303682. 200 / 596\n",
"Loading: 303696. 201 / 596\n",
"Loading: 303700. 202 / 596\n",
"Loading: 303707. 203 / 596\n",
"Loading: 303715. 204 / 596\n",
"Loading: 303725. 205 / 596\n",
"Loading: 303731. 206 / 596\n",
"Loading: 3749052. 207 / 596\n",
"Loading: 3749068. 208 / 596\n",
"Loading: 3749079. 209 / 596\n",
"Loading: 3749133. 210 / 596\n",
"Loading: 3749153. 211 / 596\n",
"Loading: 3749192. 212 / 596\n",
"Loading: 3749196. 213 / 596\n",
"Loading: 3749233. 214 / 596\n",
"Loading: 3749246. 215 / 596\n",
"Loading: 3749253. 216 / 596\n",
"Loading: 3749257. 217 / 596\n",
"Loading: 3749276. 218 / 596\n",
"Loading: 3749278. 219 / 596\n",
"Loading: 3749296. 220 / 596\n",
"Loading: 3749310. 221 / 596\n",
"Loading: 3749346. 222 / 596\n",
"Loading: 3749358. 223 / 596\n",
"Loading: 3749360. 224 / 596\n",
"Loading: 3749403. 225 / 596\n",
"Loading: 3749431. 226 / 596\n",
"Loading: 3749434. 227 / 596\n",
"Loading: 3749448. 228 / 596\n",
"Loading: 3749453. 229 / 596\n",
"Loading: 3749454. 230 / 596\n",
"Loading: 3749462. 231 / 596\n",
"Loading: 3749465. 232 / 596\n",
"Loading: 3749493. 233 / 596\n",
"Loading: 3749522. 234 / 596\n",
"Loading: 3749526. 235 / 596\n",
"Loading: 3749528. 236 / 596\n",
"Loading: 3749552. 237 / 596\n",
"Loading: 3749603. 238 / 596\n",
"Loading: 3749642. 239 / 596\n",
"Loading: 3750200. 240 / 596\n",
"Loading: 3750201. 241 / 596\n",
"Loading: 3752619. 242 / 596\n",
"Loading: 68313. 243 / 596\n",
"Loading: 68314. 244 / 596\n",
"Loading: 68315. 245 / 596\n",
"Loading: 68316. 246 / 596\n",
"Loading: 68317. 247 / 596\n",
"Loading: 68318. 248 / 596\n",
"Loading: 68319. 249 / 596\n",
"Loading: 68320. 250 / 596\n",
"Loading: 68321. 251 / 596\n",
"Loading: 68322. 252 / 596\n",
"Loading: 68323. 253 / 596\n",
"Loading: 68324. 254 / 596\n",
"Loading: 68325. 255 / 596\n",
"Loading: 68326. 256 / 596\n",
"Loading: 68327. 257 / 596\n",
"Loading: 68328. 258 / 596\n",
"Loading: 68329. 259 / 596\n",
"Loading: 68330. 260 / 596\n",
"Loading: 68331. 261 / 596\n",
"Loading: 68332. 262 / 596\n",
"Loading: 68333. 263 / 596\n",
"Loading: 68334. 264 / 596\n",
"Loading: 68335. 265 / 596\n",
"Loading: 68336. 266 / 596\n",
"Loading: 68339. 267 / 596\n",
"Loading: 68340. 268 / 596\n",
"Loading: 68341. 269 / 596\n",
"Loading: 68342. 270 / 596\n",
"Loading: 68347. 271 / 596\n",
"Loading: 68348. 272 / 596\n",
"Loading: 68350. 273 / 596\n",
"Loading: 68351. 274 / 596\n",
"Loading: 68352. 275 / 596\n",
"Loading: 68353. 276 / 596\n",
"Loading: 68354. 277 / 596\n",
"Loading: 68356. 278 / 596\n",
"Loading: 68358. 279 / 596\n",
"Loading: 68359. 280 / 596\n",
"Loading: 68360. 281 / 596\n",
"Loading: 68361. 282 / 596\n",
"Loading: 68363. 283 / 596\n",
"Loading: 68364. 284 / 596\n",
"Loading: 68365. 285 / 596\n",
"Loading: 68366. 286 / 596\n",
"Loading: 69138. 287 / 596\n",
"Loading: 69139. 288 / 596\n",
"Loading: 69141. 289 / 596\n",
"Loading: 69142. 290 / 596\n",
"Loading: 69143. 291 / 596\n",
"Loading: 69144. 292 / 596\n",
"Loading: 69145. 293 / 596\n",
"Loading: 69146. 294 / 596\n",
"Loading: 69147. 295 / 596\n",
"Loading: 69148. 296 / 596\n",
"Loading: 69149. 297 / 596\n",
"Loading: 69151. 298 / 596\n",
"Loading: 69153. 299 / 596\n",
"Loading: 69154. 300 / 596\n",
"Loading: 69155. 301 / 596\n",
"Loading: 69156. 302 / 596\n",
"Loading: 69157. 303 / 596\n",
"Loading: 69158. 304 / 596\n",
"Loading: 69159. 305 / 596\n",
"Loading: 69160. 306 / 596\n",
"Loading: 69162. 307 / 596\n",
"Loading: 69164. 308 / 596\n",
"Loading: 69165. 309 / 596\n",
"Loading: 69166. 310 / 596\n",
"Loading: 69169. 311 / 596\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading: 69170. 312 / 596\n",
"Loading: 69171. 313 / 596\n",
"Loading: 69172. 314 / 596\n",
"Loading: 69173. 315 / 596\n",
"Loading: 69174. 316 / 596\n",
"Loading: 69175. 317 / 596\n",
"Loading: 69176. 318 / 596\n",
"Loading: 69177. 319 / 596\n",
"Loading: 69178. 320 / 596\n",
"Loading: 69179. 321 / 596\n",
"Loading: 69180. 322 / 596\n",
"Loading: 69181. 323 / 596\n",
"Loading: 69182. 324 / 596\n",
"Loading: 69183. 325 / 596\n",
"Loading: 69184. 326 / 596\n",
"Loading: 69185. 327 / 596\n",
"Loading: 69186. 328 / 596\n",
"Loading: 69187. 329 / 596\n",
"Loading: 69189. 330 / 596\n",
"Loading: 69195. 331 / 596\n",
"Loading: 69207. 332 / 596\n",
"Loading: 69209. 333 / 596\n",
"Loading: 69210. 334 / 596\n",
"Loading: 69211. 335 / 596\n",
"Loading: 69212. 336 / 596\n",
"Loading: 69213. 337 / 596\n",
"Loading: 69214. 338 / 596\n",
"Loading: 69215. 339 / 596\n",
"Loading: 69216. 340 / 596\n",
"Loading: 69217. 341 / 596\n",
"Loading: 69218. 342 / 596\n",
"Loading: 69219. 343 / 596\n",
"Loading: 69220. 344 / 596\n",
"Loading: 69221. 345 / 596\n",
"Loading: 69222. 346 / 596\n",
"Loading: 69223. 347 / 596\n",
"Loading: 69224. 348 / 596\n",
"Loading: 69225. 349 / 596\n",
"Loading: 69226. 350 / 596\n",
"Loading: 69227. 351 / 596\n",
"Loading: 69228. 352 / 596\n",
"Loading: 69229. 353 / 596\n",
"Loading: 69230. 354 / 596\n",
"Loading: 69231. 355 / 596\n",
"Loading: 69232. 356 / 596\n",
"Loading: 69233. 357 / 596\n",
"Loading: 69234. 358 / 596\n",
"Loading: 69235. 359 / 596\n",
"Loading: 69236. 360 / 596\n",
"Loading: 69237. 361 / 596\n",
"Loading: 69238. 362 / 596\n",
"Loading: 69239. 363 / 596\n",
"Loading: 69240. 364 / 596\n",
"Loading: 69241. 365 / 596\n",
"Loading: 69242. 366 / 596\n",
"Loading: 69243. 367 / 596\n",
"Loading: 69244. 368 / 596\n",
"Loading: 69245. 369 / 596\n",
"Loading: 69246. 370 / 596\n",
"Loading: 69247. 371 / 596\n",
"Loading: 69248. 372 / 596\n",
"Loading: 69249. 373 / 596\n",
"Loading: 69250. 374 / 596\n",
"Loading: 69251. 375 / 596\n",
"Loading: 69252. 376 / 596\n",
"Loading: 69253. 377 / 596\n",
"Loading: 69254. 378 / 596\n",
"Loading: 69255. 379 / 596\n",
"Loading: 69256. 380 / 596\n",
"Loading: 69257. 381 / 596\n",
"Loading: 69259. 382 / 596\n",
"Loading: 69260. 383 / 596\n",
"Loading: 69262. 384 / 596\n",
"Loading: 69263. 385 / 596\n",
"Loading: 69264. 386 / 596\n",
"Loading: 69265. 387 / 596\n",
"Loading: 69267. 388 / 596\n",
"Loading: 69268. 389 / 596\n",
"Loading: 69269. 390 / 596\n",
"Loading: 69270. 391 / 596\n",
"Loading: 69271. 392 / 596\n",
"Loading: 69272. 393 / 596\n",
"Loading: 69273. 394 / 596\n",
"Loading: 69274. 395 / 596\n",
"Loading: 69275. 396 / 596\n",
"Loading: 69276. 397 / 596\n",
"Loading: 69277. 398 / 596\n",
"Loading: 69278. 399 / 596\n",
"Loading: 69279. 400 / 596\n",
"Loading: 69280. 401 / 596\n",
"Loading: 69282. 402 / 596\n",
"Loading: 69283. 403 / 596\n",
"Loading: 69285. 404 / 596\n",
"Loading: 69286. 405 / 596\n",
"Loading: 69287. 406 / 596\n",
"Loading: 69288. 407 / 596\n",
"Loading: 69289. 408 / 596\n",
"Loading: 69291. 409 / 596\n",
"Loading: 69292. 410 / 596\n",
"Loading: 69293. 411 / 596\n",
"Loading: 69295. 412 / 596\n",
"Loading: 69296. 413 / 596\n",
"Loading: 69297. 414 / 596\n",
"Loading: 69298. 415 / 596\n",
"Loading: 69299. 416 / 596\n",
"Loading: 69300. 417 / 596\n",
"Loading: 69302. 418 / 596\n",
"Loading: 69303. 419 / 596\n",
"Loading: 69304. 420 / 596\n",
"Loading: 69305. 421 / 596\n",
"Loading: 69306. 422 / 596\n",
"Loading: 69307. 423 / 596\n",
"Loading: 69308. 424 / 596\n",
"Loading: 69312. 425 / 596\n",
"Loading: 69314. 426 / 596\n",
"Loading: 69315. 427 / 596\n",
"Loading: 69316. 428 / 596\n",
"Loading: 69318. 429 / 596\n",
"Loading: 69319. 430 / 596\n",
"Loading: 69320. 431 / 596\n",
"Loading: 69322. 432 / 596\n",
"Loading: 69323. 433 / 596\n",
"Loading: 69324. 434 / 596\n",
"Loading: 69325. 435 / 596\n",
"Loading: 69326. 436 / 596\n",
"Loading: 69327. 437 / 596\n",
"Loading: 69328. 438 / 596\n",
"Loading: 69329. 439 / 596\n",
"Loading: 69330. 440 / 596\n",
"Loading: 69331. 441 / 596\n",
"Loading: 69332. 442 / 596\n",
"Loading: 69333. 443 / 596\n",
"Loading: 69334. 444 / 596\n",
"Loading: 69335. 445 / 596\n",
"Loading: 69336. 446 / 596\n",
"Loading: 69337. 447 / 596\n",
"Loading: 69338. 448 / 596\n",
"Loading: 69340. 449 / 596\n",
"Loading: 69343. 450 / 596\n",
"Loading: 70219. 451 / 596\n",
"Loading: 70220. 452 / 596\n",
"Loading: 70221. 453 / 596\n",
"Loading: 70223. 454 / 596\n",
"Loading: 70224. 455 / 596\n",
"Loading: 70225. 456 / 596\n",
"Loading: 70256. 457 / 596\n",
"Loading: 70259. 458 / 596\n",
"Loading: 70260. 459 / 596\n",
"Loading: 70262. 460 / 596\n",
"Loading: 70263. 461 / 596\n",
"Loading: 70264. 462 / 596\n",
"Loading: 70270. 463 / 596\n",
"Loading: 70271. 464 / 596\n",
"Loading: 70272. 465 / 596\n",
"Loading: 70273. 466 / 596\n",
"Loading: 70275. 467 / 596\n",
"Loading: 70276. 468 / 596\n",
"Loading: 70277. 469 / 596\n",
"Loading: 70280. 470 / 596\n",
"Loading: 70281. 471 / 596\n",
"Loading: 70282. 472 / 596\n",
"Loading: 70283. 473 / 596\n",
"Loading: 70284. 474 / 596\n",
"Loading: 70286. 475 / 596\n",
"Loading: 70287. 476 / 596\n",
"Loading: 70288. 477 / 596\n",
"Loading: 70289. 478 / 596\n",
"Loading: 70291. 479 / 596\n",
"Loading: 70292. 480 / 596\n",
"Loading: 70293. 481 / 596\n",
"Loading: 70294. 482 / 596\n",
"Loading: 70295. 483 / 596\n",
"Loading: 70296. 484 / 596\n",
"Loading: 70297. 485 / 596\n",
"Loading: 70298. 486 / 596\n",
"Loading: 70300. 487 / 596\n",
"Loading: 70301. 488 / 596\n",
"Loading: 70302. 489 / 596\n",
"Loading: 70303. 490 / 596\n",
"Loading: 70304. 491 / 596\n",
"Loading: 70305. 492 / 596\n",
"Loading: 70306. 493 / 596\n",
"Loading: 70307. 494 / 596\n",
"Loading: 70308. 495 / 596\n",
"Loading: 70309. 496 / 596\n",
"Loading: 7525. 497 / 596\n",
"Loading: 7529. 498 / 596\n",
"Loading: 7530. 499 / 596\n",
"Loading: 7531. 500 / 596\n",
"Loading: 7532. 501 / 596\n",
"Loading: 7533. 502 / 596\n",
"Loading: 7534. 503 / 596\n",
"Loading: 7535. 504 / 596\n",
"Loading: 7536. 505 / 596\n",
"Loading: 7537. 506 / 596\n",
"Loading: 7538. 507 / 596\n",
"Loading: 7539. 508 / 596\n",
"Loading: 7540. 509 / 596\n",
"Loading: 7541. 510 / 596\n",
"Loading: 7542. 511 / 596\n",
"Loading: 7543. 512 / 596\n",
"Loading: 7544. 513 / 596\n",
"Loading: 7545. 514 / 596\n",
"Loading: 7546. 515 / 596\n",
"Loading: 7547. 516 / 596\n",
"Loading: 7548. 517 / 596\n",
"Loading: 7549. 518 / 596\n",
"Loading: 7550. 519 / 596\n",
"Loading: 7551. 520 / 596\n",
"Loading: 7552. 521 / 596\n",
"Loading: 7553. 522 / 596\n",
"Loading: 7554. 523 / 596\n",
"Loading: 7555. 524 / 596\n",
"Loading: 7556. 525 / 596\n",
"Loading: 7557. 526 / 596\n",
"Loading: 7558. 527 / 596\n",
"Loading: 7559. 528 / 596\n",
"Loading: 7560. 529 / 596\n",
"Loading: 7561. 530 / 596\n",
"Loading: 7562. 531 / 596\n",
"Loading: 7563. 532 / 596\n",
"Loading: 7564. 533 / 596\n",
"Loading: 7565. 534 / 596\n",
"Loading: 7566. 535 / 596\n",
"Loading: 7567. 536 / 596\n",
"Loading: 7568. 537 / 596\n",
"Loading: 7569. 538 / 596\n",
"Loading: 7570. 539 / 596\n",
"Loading: 7571. 540 / 596\n",
"Loading: 7572. 541 / 596\n",
"Loading: 7576. 542 / 596\n",
"Loading: 7577. 543 / 596\n",
"Loading: 7578. 544 / 596\n",
"Loading: 7579. 545 / 596\n",
"Loading: 7580. 546 / 596\n",
"Loading: 7581. 547 / 596\n",
"Loading: 7582. 548 / 596\n",
"Loading: 7583. 549 / 596\n",
"Loading: 7584. 550 / 596\n",
"Loading: 7585. 551 / 596\n",
"Loading: 7586. 552 / 596\n",
"Loading: 8649. 553 / 596\n",
"Loading: 8650. 554 / 596\n",
"Loading: 8651. 555 / 596\n",
"Loading: 8652. 556 / 596\n",
"Loading: 8655. 557 / 596\n",
"Loading: 8656. 558 / 596\n",
"Loading: 8657. 559 / 596\n",
"Loading: 8658. 560 / 596\n",
"Loading: 9575. 561 / 596\n",
"Loading: 9581. 562 / 596\n",
"Loading: 9592. 563 / 596\n",
"Loading: 9602. 564 / 596\n",
"Loading: 9609. 565 / 596\n",
"Loading: 9620. 566 / 596\n",
"Loading: 9636. 567 / 596\n",
"Loading: 9642. 568 / 596\n",
"Loading: 9650. 569 / 596\n",
"Loading: 9661. 570 / 596\n",
"Loading: 9673. 571 / 596\n",
"Loading: 9682. 572 / 596\n",
"Loading: 9695. 573 / 596\n",
"Loading: 9700. 574 / 596\n",
"Loading: 9717. 575 / 596\n",
"Loading: 9726. 576 / 596\n",
"Loading: 9736. 577 / 596\n",
"Loading: 9742. 578 / 596\n",
"Loading: 9754. 579 / 596\n",
"Loading: 9765. 580 / 596\n",
"Loading: 9774. 581 / 596\n",
"Loading: 9783. 582 / 596\n",
"Loading: 9794. 583 / 596\n",
"Loading: 9799. 584 / 596\n",
"Loading: 9811. 585 / 596\n",
"Loading: 9827. 586 / 596\n",
"Loading: 9837. 587 / 596\n",
"Loading: 9855. 588 / 596\n",
"Loading: 9860. 589 / 596\n",
"Loading: 9870. 590 / 596\n",
"Loading: 9880. 591 / 596\n",
"Loading: 9889. 592 / 596\n",
"Loading: 9912. 593 / 596\n",
"Loading: 9924. 594 / 596\n",
"Loading: 9928. 595 / 596\n",
"Loading: 9948. 596 / 596\n"
]
}
],
"source": [
"# For those matches, load the events and append to single dataframe\n",
"json_files = [int(x.split('.')[0]) for x in os.listdir('open-data-master/data/events') if 'json' in x]\n",
"male_matches = [x for x in json_files if x in male_matches]\n",
"df_list = []\n",
"count = 1\n",
"for event_json in male_matches:\n",
" print('Loading: {}. {} / {}'.format(event_json, count, len(male_matches)))\n",
" df = pd.read_json('open-data-master/data/events/' + str(event_json) + '.json')\n",
" df['match_id'] = event_json\n",
" df_list.append(df)\n",
" count +=1\n",
" \n",
"df = pd.concat(df_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Filter Shot Events"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:21:51.925176Z",
"start_time": "2021-01-03T11:21:42.970933Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" index | \n",
" period | \n",
" timestamp | \n",
" minute | \n",
" second | \n",
" type | \n",
" possession | \n",
" possession_team | \n",
" play_pattern | \n",
" ... | \n",
" match_id | \n",
" clearance | \n",
" off_camera | \n",
" miscontrol | \n",
" 50_50 | \n",
" out | \n",
" injury_stoppage | \n",
" half_start | \n",
" player_off | \n",
" half_end | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 65f16e50-7c5d-4293-b2fc-d20887a772f9 | \n",
" 148 | \n",
" 1 | \n",
" 2021-01-03 00:02:29.094 | \n",
" 2 | \n",
" 29 | \n",
" {'id': 16, 'name': 'Shot'} | \n",
" 6 | \n",
" {'id': 217, 'name': 'Barcelona'} | \n",
" {'id': 1, 'name': 'Regular Play'} | \n",
" ... | \n",
" 15946 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" b0f73423-3990-45ae-9dda-3512c2d1aff3 | \n",
" 283 | \n",
" 1 | \n",
" 2021-01-03 00:05:39.239 | \n",
" 5 | \n",
" 39 | \n",
" {'id': 16, 'name': 'Shot'} | \n",
" 11 | \n",
" {'id': 217, 'name': 'Barcelona'} | \n",
" {'id': 1, 'name': 'Regular Play'} | \n",
" ... | \n",
" 15946 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 13b1ddab-d22e-43d9-bfe4-12632fea1a27 | \n",
" 755 | \n",
" 1 | \n",
" 2021-01-03 00:15:28.625 | \n",
" 15 | \n",
" 28 | \n",
" {'id': 16, 'name': 'Shot'} | \n",
" 26 | \n",
" {'id': 217, 'name': 'Barcelona'} | \n",
" {'id': 8, 'name': 'From Keeper'} | \n",
" ... | \n",
" 15946 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 43 columns
\n",
"
"
],
"text/plain": [
" id index period \\\n",
"0 65f16e50-7c5d-4293-b2fc-d20887a772f9 148 1 \n",
"1 b0f73423-3990-45ae-9dda-3512c2d1aff3 283 1 \n",
"2 13b1ddab-d22e-43d9-bfe4-12632fea1a27 755 1 \n",
"\n",
" timestamp minute second type \\\n",
"0 2021-01-03 00:02:29.094 2 29 {'id': 16, 'name': 'Shot'} \n",
"1 2021-01-03 00:05:39.239 5 39 {'id': 16, 'name': 'Shot'} \n",
"2 2021-01-03 00:15:28.625 15 28 {'id': 16, 'name': 'Shot'} \n",
"\n",
" possession possession_team \\\n",
"0 6 {'id': 217, 'name': 'Barcelona'} \n",
"1 11 {'id': 217, 'name': 'Barcelona'} \n",
"2 26 {'id': 217, 'name': 'Barcelona'} \n",
"\n",
" play_pattern ... match_id clearance off_camera \\\n",
"0 {'id': 1, 'name': 'Regular Play'} ... 15946 NaN NaN \n",
"1 {'id': 1, 'name': 'Regular Play'} ... 15946 NaN NaN \n",
"2 {'id': 8, 'name': 'From Keeper'} ... 15946 NaN NaN \n",
"\n",
" miscontrol 50_50 out injury_stoppage half_start player_off half_end \n",
"0 NaN NaN NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN NaN NaN \n",
"\n",
"[3 rows x 43 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shots = df[~df['shot'].isnull()]\n",
"shots.reset_index(drop = True, inplace = True)\n",
"shots.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Expected Goal Features"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:30:41.273039Z",
"start_time": "2021-01-03T11:29:02.994836Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Ciaran\\AppData\\Roaming\\Python\\Python36\\site-packages\\ipykernel_launcher.py:36: RuntimeWarning: invalid value encountered in arccos\n"
]
}
],
"source": [
"left_post_x, left_post_y = (120, 36)\n",
"right_post_x, right_post_y = (120, 44)\n",
"\n",
"shots_model=pd.DataFrame(columns=['goal','x','y'])\n",
"for index, shot in shots.iterrows():\n",
" \n",
" # Keep only open play\n",
" open_play = 0\n",
" if shot['shot']['type']['name'] == 'Open Play':\n",
" open_play = 1\n",
"\n",
" # Keep non-headed shots\n",
" header = 0\n",
" if shot['shot']['body_part']['name'] == 'Head':\n",
" header = 1\n",
" if (open_play == 1) & (header == 0):\n",
" # Goal\n",
" if shot['shot']['outcome']['name'] == 'Goal':\n",
" shots_model.loc[index, 'goal'] = 1\n",
" else:\n",
" shots_model.loc[index, 'goal'] = 0\n",
"\n",
" # X, Y locations\n",
" shot_location_x, shot_location_y = shot['location']\n",
" shots_model.loc[index, 'x'] = shot_location_x\n",
" shots_model.loc[index, 'y'] = shot_location_y\n",
" shots_model.loc[index,'c']=abs(shot_location_y-40)\n",
"\n",
" # Distance to centre of goal and angle\n",
" shots_model.loc[index,'distance']=np.sqrt((120-shot_location_x)**2 + (40-shot_location_y)**2)\n",
" \n",
" # Angle to goal (radians)\n",
" a = np.sqrt((shot_location_x - right_post_x)**2 + (shot_location_y - right_post_y)**2)\n",
" b = np.sqrt((left_post_x - right_post_x)**2 + (left_post_y - right_post_y)**2)\n",
" c = np.sqrt((left_post_x - shot_location_x)**2 + (left_post_y - shot_location_y)**2)\n",
" angle_ac = np.arccos((a**2 + c**2 - b**2)/(2*a*c))\n",
" if angle_ac<0:\n",
" angle_ac=np.pi+angle_ac\n",
" shots_model.loc[index,'angle'] =angle_ac\n",
"\n",
" # Play pattern\n",
" shots_model.loc[index, 'play_pattern'] = shot['play_pattern']['name']\n",
"\n",
" # Body part\n",
" shots_model.loc[index, 'body_part'] = shot['shot']['body_part']['name']\n",
"\n",
" # First time\n",
" shots_model.loc[index, 'first_time'] = 0\n",
" if 'first_time' in shot['shot'].keys():\n",
" shots_model.loc[index, 'first_time'] = 1\n",
"\n",
" # Technique\n",
" shots_model.loc[index, 'technique'] = shot['shot']['technique']['name']\n",
"\n",
" # Pressure\n",
" shots_model.loc[index, 'under_pressure'] = 0\n",
" if shot['under_pressure'] == True:\n",
" shots_model.loc[index, 'under_pressure'] = 1\n",
"\n",
" # Freeze Frame - defender location\n",
" # Number of defenders between shot location and goal\n",
" # Distance to nearest defender between shot location and goal\n",
" # Distance to nearest defender\n",
" # TBD: position of nearest defender\n",
" # TBD: angle of goal left after removing blocked defenders (assume ~1m width)\n",
" \n",
" freeze_frame_def = []\n",
" for player in shot['shot']['freeze_frame']:\n",
" if player['teammate'] == False:\n",
" freeze_frame_def.append(player)\n",
" \n",
" distance_nearest_defender = None\n",
" distance_nearest_blocking_defender = None\n",
" blocking_defender = []\n",
" for defender in freeze_frame_def:\n",
" # defender specific\n",
" defender_x, defender_y = defender['location']\n",
" distance_defender = np.sqrt((shot_location_x - defender_x)**2 + (shot_location_y - defender_y)**2)\n",
" if distance_nearest_defender == None:\n",
" distance_nearest_defender = distance_defender\n",
" elif distance_defender < distance_nearest_defender:\n",
" distance_nearest_defender = distance_defender\n",
" distance_defender = None\n",
"\n",
" blocking = mfun.is_inside(shot_location_x, shot_location_y\n",
" , left_post_x, left_post_y\n",
" , right_post_x, right_post_y\n",
" , defender_x, defender_y)\n",
" if blocking == True:\n",
" # If defender is blocking part of the goal..\n",
" blocking_defender.append(defender)\n",
"\n",
" # Measure distance of blocking defender\n",
" distance_blocking_defender = np.sqrt((shot_location_x - defender_x)**2 + (shot_location_y - defender_y)**2)\n",
" \n",
" if distance_nearest_blocking_defender == None:\n",
" distance_nearest_blocking_defender = distance_blocking_defender\n",
" elif distance_blocking_defender < distance_nearest_blocking_defender:\n",
" distance_nearest_blocking_defender = distance_blocking_defender\n",
" distance_blocking_defender = None\n",
" \n",
" shots_model.loc[index, 'distance_nearest_defender'] = distance_nearest_defender\n",
" shots_model.loc[index, 'distance_nearest_blocking_defender'] = distance_nearest_blocking_defender\n",
" shots_model.loc[index, 'number_blocking_defenders'] = len(blocking_defender)\n",
"\n",
" \n",
" # Key Pass info\n",
" # TBD - get info from previous pass\n",
" # Eg. cross / through ball / where it was etc\n",
" \n",
" # StatsBomb xG\n",
" shots_model.loc[index, 'statsbomb_xg'] = shot['shot']['statsbomb_xg']\n",
"\n",
"# # Make locations numeric\n",
"# shots_model['x'] = pd.to_numeric(shots_model['x'])\n",
"# shots_model['y'] = pd.to_numeric(shots_model['y'])\n",
"\n",
"# # Try squared distances\n",
"# shots_model['d2'] = shots_model['distance']**2\n",
"# shots_model['x2'] = shots_model['x']**2\n",
"# shots_model['c2'] = shots_model['c']**2\n",
"# # Try angle * x location\n",
"# shots_model['ax'] = shots_model['angle']*shots_model['x']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save Preprocessed Data"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:30:41.336836Z",
"start_time": "2021-01-03T11:30:41.275002Z"
}
},
"outputs": [],
"source": [
"shots_model.to_pickle(\"./shots_model.pkl\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Expected Goals Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Shot Data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:30:41.524375Z",
"start_time": "2021-01-03T11:30:41.337833Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" goal | \n",
" x | \n",
" y | \n",
" c | \n",
" distance | \n",
" angle | \n",
" play_pattern | \n",
" body_part | \n",
" first_time | \n",
" technique | \n",
" under_pressure | \n",
" distance_nearest_defender | \n",
" distance_nearest_blocking_defender | \n",
" number_blocking_defenders | \n",
" statsbomb_xg | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 111.7 | \n",
" 51.7 | \n",
" 11.7 | \n",
" 14.345034 | \n",
" 0.336567 | \n",
" Regular Play | \n",
" Right Foot | \n",
" 1.0 | \n",
" Half Volley | \n",
" 0.0 | \n",
" 1.303840 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.075164 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 114 | \n",
" 27 | \n",
" 13.0 | \n",
" 14.317821 | \n",
" 0.248710 | \n",
" Regular Play | \n",
" Left Foot | \n",
" 1.0 | \n",
" Volley | \n",
" 0.0 | \n",
" 3.700000 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.062892 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 92 | \n",
" 34.5 | \n",
" 5.5 | \n",
" 28.535066 | \n",
" 0.273578 | \n",
" From Keeper | \n",
" Left Foot | \n",
" 0.0 | \n",
" Normal | \n",
" 0.0 | \n",
" 2.884441 | \n",
" 5.124451 | \n",
" 1.0 | \n",
" 0.020535 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 107 | \n",
" 25 | \n",
" 15.0 | \n",
" 19.849433 | \n",
" 0.268489 | \n",
" From Corner | \n",
" Right Foot | \n",
" 0.0 | \n",
" Normal | \n",
" 0.0 | \n",
" 3.244996 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.035420 | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" 108.1 | \n",
" 27.4 | \n",
" 12.6 | \n",
" 17.331186 | \n",
" 0.323048 | \n",
" Regular Play | \n",
" Left Foot | \n",
" 1.0 | \n",
" Half Volley | \n",
" 1.0 | \n",
" 2.039608 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.089920 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" goal x y c distance angle play_pattern body_part \\\n",
"0 0 111.7 51.7 11.7 14.345034 0.336567 Regular Play Right Foot \n",
"1 0 114 27 13.0 14.317821 0.248710 Regular Play Left Foot \n",
"2 0 92 34.5 5.5 28.535066 0.273578 From Keeper Left Foot \n",
"4 0 107 25 15.0 19.849433 0.268489 From Corner Right Foot \n",
"5 0 108.1 27.4 12.6 17.331186 0.323048 Regular Play Left Foot \n",
"\n",
" first_time technique under_pressure distance_nearest_defender \\\n",
"0 1.0 Half Volley 0.0 1.303840 \n",
"1 1.0 Volley 0.0 3.700000 \n",
"2 0.0 Normal 0.0 2.884441 \n",
"4 0.0 Normal 0.0 3.244996 \n",
"5 1.0 Half Volley 1.0 2.039608 \n",
"\n",
" distance_nearest_blocking_defender number_blocking_defenders statsbomb_xg \n",
"0 NaN 0.0 0.075164 \n",
"1 NaN 0.0 0.062892 \n",
"2 5.124451 1.0 0.020535 \n",
"4 NaN 0.0 0.035420 \n",
"5 NaN 0.0 0.089920 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load shot model\n",
"shots_model = pd.read_pickle(\"./shots_model.pkl\")\n",
"shots_model.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Separate into Train/Test Data"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:31:02.677856Z",
"start_time": "2021-01-03T11:31:02.655917Z"
}
},
"outputs": [],
"source": [
"# Separate into train, test data for modelling\n",
"X_train, X_test, y_train, y_test = train_test_split(shots_model.loc[:, shots_model.columns != 'goal']\n",
" , shots_model.loc[:, shots_model.columns == 'goal']\n",
" , test_size=0.2, random_state=42)\n",
"train = pd.concat([y_train, X_train], axis=1)\n",
"test = pd.concat([y_test, X_test], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Logistic Regression"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:31:04.578548Z",
"start_time": "2021-01-03T11:31:04.503376Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==================================================================================\n",
"Dep. Variable: ['goal[0]', 'goal[1]'] No. Observations: 9295\n",
"Model: GLM Df Residuals: 9290\n",
"Model Family: Binomial Df Model: 4\n",
"Link Function: logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -2901.7\n",
"Date: Sun, 03 Jan 2021 Deviance: 5803.5\n",
"Time: 11:31:04 Pearson chi2: 9.45e+03\n",
"No. Iterations: 6 \n",
"Covariance Type: nonrobust \n",
"=============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------------\n",
"Intercept 1.0519 0.205 5.122 0.000 0.649 1.454\n",
"distance 0.1080 0.008 12.989 0.000 0.092 0.124\n",
"angle -1.6109 0.181 -8.917 0.000 -1.965 -1.257\n",
"distance_nearest_defender -0.1242 0.021 -5.858 0.000 -0.166 -0.083\n",
"number_blocking_defenders 0.3260 0.054 6.053 0.000 0.220 0.432\n",
"=============================================================================================\n"
]
}
],
"source": [
"# A GLM for fitting goal probability\n",
"model_variables = ['distance'\n",
" , 'angle'\n",
" , 'distance_nearest_defender'\n",
" , 'number_blocking_defenders'\n",
" ]\n",
"\n",
"model=''\n",
"for v in model_variables[:-1]:\n",
" model = model + v + ' + '\n",
"model = model + model_variables[-1]\n",
"\n",
"#Fit the model\n",
"xG_model = smf.glm(formula=\"goal ~ \" + model, data=train, \n",
" family=sm.families.Binomial()).fit()\n",
"print(xG_model.summary()) \n",
"xG_model_params=xG_model.params"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Calculate xG on Test data"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:31:17.119822Z",
"start_time": "2021-01-03T11:31:17.111841Z"
}
},
"outputs": [],
"source": [
"# Calculate xG for GLM using each shot as input (row of shots_model)\n",
"def calculate_xG(sh):\n",
" # For the model 'b', get the intercept\n",
" bsum=xG_model_params[0]\n",
" # For as many variables as put in the model, \n",
" # multiply the coefficient by the value of that shot.\n",
" for i,v in enumerate(model_variables):\n",
" # bsum = intercept + (coefficient * variable value)\n",
" bsum=bsum+xG_model_params[i+1]*sh[v]\n",
" # Calculate probability of goal as 1 / 1 + exp(model output)\n",
" xG = 1/(1+np.exp(bsum)) \n",
" return xG "
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:31:17.727181Z",
"start_time": "2021-01-03T11:31:17.368141Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" goal | \n",
" x | \n",
" y | \n",
" c | \n",
" distance | \n",
" angle | \n",
" play_pattern | \n",
" body_part | \n",
" first_time | \n",
" technique | \n",
" under_pressure | \n",
" distance_nearest_defender | \n",
" distance_nearest_blocking_defender | \n",
" number_blocking_defenders | \n",
" statsbomb_xg | \n",
" xG | \n",
"
\n",
" \n",
" \n",
" \n",
" 3580 | \n",
" 0 | \n",
" 102.5 | \n",
" 23.8 | \n",
" 16.2 | \n",
" 23.847222 | \n",
" 0.248088 | \n",
" From Corner | \n",
" Right Foot | \n",
" 0.0 | \n",
" Normal | \n",
" 0.0 | \n",
" 3.894868 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.025732 | \n",
" 0.060465 | \n",
"
\n",
" \n",
" 13598 | \n",
" 0 | \n",
" 115 | \n",
" 26 | \n",
" 14.0 | \n",
" 14.866069 | \n",
" 0.192701 | \n",
" Regular Play | \n",
" Left Foot | \n",
" 1.0 | \n",
" Normal | \n",
" 0.0 | \n",
" 5.000000 | \n",
" 10.440307 | \n",
" 1.0 | \n",
" 0.052736 | \n",
" 0.113889 | \n",
"
\n",
" \n",
" 4691 | \n",
" 0 | \n",
" 102.1 | \n",
" 36.3 | \n",
" 3.7 | \n",
" 18.278403 | \n",
" 0.422998 | \n",
" Regular Play | \n",
" Left Foot | \n",
" 0.0 | \n",
" Half Volley | \n",
" 0.0 | \n",
" 1.664332 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.047882 | \n",
" 0.105522 | \n",
"
\n",
" \n",
" 8187 | \n",
" 1 | \n",
" 112.5 | \n",
" 38.1 | \n",
" 1.9 | \n",
" 7.736924 | \n",
" 0.939567 | \n",
" From Goal Kick | \n",
" Right Foot | \n",
" 0.0 | \n",
" Normal | \n",
" 0.0 | \n",
" 1.204159 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.354846 | \n",
" 0.444201 | \n",
"
\n",
" \n",
" 675 | \n",
" 0 | \n",
" 117.4 | \n",
" 30.7 | \n",
" 9.3 | \n",
" 9.656604 | \n",
" 0.263018 | \n",
" Regular Play | \n",
" Left Foot | \n",
" 0.0 | \n",
" Normal | \n",
" 0.0 | \n",
" 3.605551 | \n",
" NaN | \n",
" 0.0 | \n",
" 0.520617 | \n",
" 0.227407 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" goal x y c distance angle play_pattern \\\n",
"3580 0 102.5 23.8 16.2 23.847222 0.248088 From Corner \n",
"13598 0 115 26 14.0 14.866069 0.192701 Regular Play \n",
"4691 0 102.1 36.3 3.7 18.278403 0.422998 Regular Play \n",
"8187 1 112.5 38.1 1.9 7.736924 0.939567 From Goal Kick \n",
"675 0 117.4 30.7 9.3 9.656604 0.263018 Regular Play \n",
"\n",
" body_part first_time technique under_pressure \\\n",
"3580 Right Foot 0.0 Normal 0.0 \n",
"13598 Left Foot 1.0 Normal 0.0 \n",
"4691 Left Foot 0.0 Half Volley 0.0 \n",
"8187 Right Foot 0.0 Normal 0.0 \n",
"675 Left Foot 0.0 Normal 0.0 \n",
"\n",
" distance_nearest_defender distance_nearest_blocking_defender \\\n",
"3580 3.894868 NaN \n",
"13598 5.000000 10.440307 \n",
"4691 1.664332 NaN \n",
"8187 1.204159 NaN \n",
"675 3.605551 NaN \n",
"\n",
" number_blocking_defenders statsbomb_xg xG \n",
"3580 0.0 0.025732 0.060465 \n",
"13598 1.0 0.052736 0.113889 \n",
"4691 0.0 0.047882 0.105522 \n",
"8187 0.0 0.354846 0.444201 \n",
"675 0.0 0.520617 0.227407 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Add an xG to my dataframe\n",
"train_xG=train.apply(calculate_xG, axis=1)\n",
"train['xG'] = train_xG\n",
"\n",
"test_xG=test.apply(calculate_xG, axis=1)\n",
"test['xG'] = test_xG\n",
"test.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Model Evaluation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Check (x, y) Probabilities by # Defenders and Distance to Nearest Defenders"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2021-01-03T11:31:25.195303Z",
"start_time": "2021-01-03T11:31:18.264550Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"