{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Positional Spending Data\n", "**Name:** Jaime AvendaƱo \n", "**Date:** 5/20/2021 \n", "**Data:** https://overthecap.com/positional-spending/ \n", "**NFL Logos:** https://raw.githubusercontent.com/statsbylopez/BlogPosts/master/nfl_teamlogos.csv \n", "<br><br>\n", "This notebook scrapes the data from overthecap.com and stored a parquet file to be used for analysis. \n", "NFL Logos are also pulled and resized for use in visualizations." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "import os\n", "import urllib.request\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from PIL import Image\n", "\n", "import janitor\n", "\n", "from sportsipy.nfl.teams import Teams" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<Response [200]>" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spending_url = 'https://overthecap.com/positional-spending/'\n", "page = requests.get(spending_url)\n", "page" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "soup = BeautifulSoup(page.content, 'html.parser')\n", "position_spending_content = soup.find_all('table')\n", "dfs = pd.read_html(str(position_spending_content))" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(383, 14)" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "year = 2013\n", "for df in dfs:\n", " df['year'] = year\n", " year += 1\n", "nfl_df = pd.concat(dfs)\n", "nfl_df.shape" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(288, 14)" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nfl_df = nfl_df.clean_names()\\\n", " .filter_on('year <= 2021')\n", "nfl_df.shape" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jaime.avendano\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1717: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " isetter(loc, v)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>team</th>\n", " <th>qb</th>\n", " <th>rb</th>\n", " <th>wr</th>\n", " <th>te</th>\n", " <th>ol</th>\n", " <th>offense</th>\n", " <th>idl</th>\n", " <th>edge</th>\n", " <th>lb</th>\n", " <th>s</th>\n", " <th>cb</th>\n", " <th>defense</th>\n", " <th>year</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Eagles</td>\n", " <td>13385137</td>\n", " <td>10203112</td>\n", " <td>19241989</td>\n", " <td>5509036</td>\n", " <td>25664899</td>\n", " <td>74004173</td>\n", " <td>5264666</td>\n", " <td>10241101</td>\n", " <td>10004817</td>\n", " <td>6920158</td>\n", " <td>6032738</td>\n", " <td>38463480</td>\n", " <td>2013</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Seahawks</td>\n", " <td>1557085</td>\n", " <td>10799653</td>\n", " <td>16831423</td>\n", " <td>12778788</td>\n", " <td>27955261</td>\n", " <td>69922210</td>\n", " <td>7701509</td>\n", " <td>25013832</td>\n", " <td>7377232</td>\n", " <td>8579619</td>\n", " <td>3507877</td>\n", " <td>52180069</td>\n", " <td>2013</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Titans</td>\n", " <td>6336958</td>\n", " <td>15376098</td>\n", " <td>12686896</td>\n", " <td>6979500</td>\n", " <td>26721984</td>\n", " <td>68101436</td>\n", " <td>6496528</td>\n", " <td>8776293</td>\n", " <td>5740835</td>\n", " <td>10777066</td>\n", " <td>9602477</td>\n", " <td>41393199</td>\n", " <td>2013</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Broncos</td>\n", " <td>18716295</td>\n", " <td>5070632</td>\n", " <td>10120554</td>\n", " <td>8013902</td>\n", " <td>24318758</td>\n", " <td>66240141</td>\n", " <td>6880138</td>\n", " <td>5313554</td>\n", " <td>12951882</td>\n", " <td>5894346</td>\n", " <td>17559225</td>\n", " <td>48599145</td>\n", " <td>2013</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Giants</td>\n", " <td>21998400</td>\n", " <td>5036739</td>\n", " <td>8677626</td>\n", " <td>2998913</td>\n", " <td>24235900</td>\n", " <td>62947578</td>\n", " <td>5118995</td>\n", " <td>9523813</td>\n", " <td>7916847</td>\n", " <td>12704990</td>\n", " <td>11202110</td>\n", " <td>46466755</td>\n", " <td>2013</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " team qb rb wr te ol offense \\\n", "0 Eagles 13385137 10203112 19241989 5509036 25664899 74004173 \n", "1 Seahawks 1557085 10799653 16831423 12778788 27955261 69922210 \n", "2 Titans 6336958 15376098 12686896 6979500 26721984 68101436 \n", "3 Broncos 18716295 5070632 10120554 8013902 24318758 66240141 \n", "4 Giants 21998400 5036739 8677626 2998913 24235900 62947578 \n", "\n", " idl edge lb s cb defense year \n", "0 5264666 10241101 10004817 6920158 6032738 38463480 2013 \n", "1 7701509 25013832 7377232 8579619 3507877 52180069 2013 \n", "2 6496528 8776293 5740835 10777066 9602477 41393199 2013 \n", "3 6880138 5313554 12951882 5894346 17559225 48599145 2013 \n", "4 5118995 9523813 7916847 12704990 11202110 46466755 2013 " ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nfl_df.loc[:, nfl_df.columns[1:-1]] = nfl_df[nfl_df.columns[1:-1]].replace('[\\$,]', '', regex=True).astype(int)\n", "nfl_df.head()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>year</th>\n", " <th>cap</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2013</td>\n", " <td>123000000.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2014</td>\n", " <td>133000000.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2015</td>\n", " <td>143280000.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2016</td>\n", " <td>155270000.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2017</td>\n", " <td>167000000.0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>2018</td>\n", " <td>177200000.0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>2019</td>\n", " <td>188200000.0</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>2020</td>\n", " <td>198200000.0</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>2021</td>\n", " <td>182500000.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " year cap\n", "0 2013 123000000.0\n", "1 2014 133000000.0\n", "2 2015 143280000.0\n", "3 2016 155270000.0\n", "4 2017 167000000.0\n", "5 2018 177200000.0\n", "6 2019 188200000.0\n", "7 2020 198200000.0\n", "8 2021 182500000.0" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Data from https://en.wikipedia.org/wiki/Salary_cap\n", "nfl_salary_cap = [[2013, 123], [2014, 133], [2015, 143.28], [2016, 155.27], [2017, 167], [2018, 177.2], [2019, 188.2], [2020, 198.2], [2021, 182.5]]\n", "cap_df = pd.DataFrame(nfl_salary_cap, columns=['year', 'cap'])\n", "cap_df.cap = cap_df.cap * 1e6\n", "cap_df" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "Int64Index: 288 entries, 0 to 31\n", "Data columns (total 15 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 team 288 non-null category\n", " 1 qb 288 non-null int32 \n", " 2 rb 288 non-null int32 \n", " 3 wr 288 non-null int32 \n", " 4 te 288 non-null int32 \n", " 5 ol 288 non-null int32 \n", " 6 offense 288 non-null int32 \n", " 7 idl 288 non-null int32 \n", " 8 edge 288 non-null int32 \n", " 9 lb 288 non-null int32 \n", " 10 s 288 non-null int32 \n", " 11 cb 288 non-null int32 \n", " 12 defense 288 non-null int32 \n", " 13 year 288 non-null int64 \n", " 14 cap 288 non-null float64 \n", "dtypes: category(1), float64(1), int32(12), int64(1)\n", "memory usage: 22.0 KB\n" ] } ], "source": [ "nfl_df = nfl_df.join(cap_df.set_index('year'), on='year')\n", "nfl_df.team = nfl_df.team.astype('category')\n", "nfl_df.info()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "nfl_df.to_parquet('teams_spending_df.parquet')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pulling NFL Logos\n", "Code based on: https://gist.github.com/Deryck97/dff8d33e9f841568201a2a0d5519ac5e" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>team</th>\n", " <th>team_code</th>\n", " <th>url</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Arizona Cardinals</td>\n", " <td>ARI</td>\n", " <td>https://upload.wikimedia.org/wikipedia/en/thum...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Atlanta Falcons</td>\n", " <td>ATL</td>\n", " <td>https://upload.wikimedia.org/wikipedia/en/thum...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Baltimore Ravens</td>\n", " <td>BAL</td>\n", " <td>https://upload.wikimedia.org/wikipedia/en/thum...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Buffalo Bills</td>\n", " <td>BUF</td>\n", " <td>https://upload.wikimedia.org/wikipedia/en/thum...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Carolina Panthers</td>\n", " <td>CAR</td>\n", " <td>https://upload.wikimedia.org/wikipedia/en/thum...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " team team_code \\\n", "0 Arizona Cardinals ARI \n", "1 Atlanta Falcons ATL \n", "2 Baltimore Ravens BAL \n", "3 Buffalo Bills BUF \n", "4 Carolina Panthers CAR \n", "\n", " url \n", "0 https://upload.wikimedia.org/wikipedia/en/thum... \n", "1 https://upload.wikimedia.org/wikipedia/en/thum... \n", "2 https://upload.wikimedia.org/wikipedia/en/thum... \n", "3 https://upload.wikimedia.org/wikipedia/en/thum... \n", "4 https://upload.wikimedia.org/wikipedia/en/thum... " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "teams_logos_df = pd.read_csv('https://raw.githubusercontent.com/statsbylopez/BlogPosts/master/nfl_teamlogos.csv')\n", "teams_logos_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(32, 4)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "teams_logos_df.loc[:, 'team'] = teams_logos_df.team.str.split().str[-1]\n", "teams_logos_df.loc[teams_logos_df.team == 'Team', 'team'] = 'Washington'\n", "teams_logos_df['path'] = os.getcwd() + '\\\\Logos\\\\' + teams_logos_df.team + '.png'\n", "teams_logos_df = teams_logos_df.drop_duplicates(subset='team')\n", "teams_logos_df.shape" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "for i in range(0,len(teams_logos_df)):\n", " img_path = teams_logos_df.path.iloc[i]\n", " urllib.request.urlretrieve(teams_logos_df.url.iloc[i], img_path)\n", " img = Image.open(img_path)\n", " img.thumbnail((100, 100), Image.ANTIALIAS)\n", " img.save(img_path)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "teams_logos_df.to_parquet('teams_logos_df.parquet')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NFL Team Stats" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[2013, 'Broncos', 0.813, 606, 399],\n", " [2013, 'Bears', 0.5, 445, 478],\n", " [2013, 'Patriots', 0.75, 444, 338],\n", " [2013, 'Eagles', 0.625, 442, 382],\n", " [2013, 'Cowboys', 0.5, 439, 432]]" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_info = []\n", "for year in range(2013, 2021):\n", " for team in Teams(year):\n", " team_info.append([year, team.name.split()[-1], team.win_percentage, team.points_for, team.points_against])\n", "team_info[:5]" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>year</th>\n", " <th>team</th>\n", " <th>win_pct</th>\n", " <th>points_for</th>\n", " <th>points_against</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2013</td>\n", " <td>Broncos</td>\n", " <td>0.813</td>\n", " <td>606</td>\n", " <td>399</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2013</td>\n", " <td>Bears</td>\n", " <td>0.500</td>\n", " <td>445</td>\n", " <td>478</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2013</td>\n", " <td>Patriots</td>\n", " <td>0.750</td>\n", " <td>444</td>\n", " <td>338</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2013</td>\n", " <td>Eagles</td>\n", " <td>0.625</td>\n", " <td>442</td>\n", " <td>382</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2013</td>\n", " <td>Cowboys</td>\n", " <td>0.500</td>\n", " <td>439</td>\n", " <td>432</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " year team win_pct points_for points_against\n", "0 2013 Broncos 0.813 606 399\n", "1 2013 Bears 0.500 445 478\n", "2 2013 Patriots 0.750 444 338\n", "3 2013 Eagles 0.625 442 382\n", "4 2013 Cowboys 0.500 439 432" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_detail_df = pd.DataFrame(team_info, columns=['year', 'team', 'win_pct', 'points_for', 'points_against'])\n", "team_detail_df.loc[team_detail_df.team == 'Redskins', 'team'] = 'Washington'\n", "team_detail_df.loc[team_detail_df.team == 'Team', 'team'] = 'Washington'\n", "team_detail_df.head()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(256, 5)" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_detail_df.to_parquet('teams_detail_df.parquet')\n", "team_detail_df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }