{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Positional Spending Data\n", "**Name:** Jaime AvendaƱo \n", "**Date:** 5/20/2021 \n", "**Data:** https://overthecap.com/positional-spending/ \n", "**NFL Logos:** https://raw.githubusercontent.com/statsbylopez/BlogPosts/master/nfl_teamlogos.csv \n", "

\n", "This notebook scrapes the data from overthecap.com and stored a parquet file to be used for analysis. \n", "NFL Logos are also pulled and resized for use in visualizations." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "import os\n", "import urllib.request\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from PIL import Image\n", "\n", "import janitor\n", "\n", "from sportsipy.nfl.teams import Teams" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spending_url = 'https://overthecap.com/positional-spending/'\n", "page = requests.get(spending_url)\n", "page" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "soup = BeautifulSoup(page.content, 'html.parser')\n", "position_spending_content = soup.find_all('table')\n", "dfs = pd.read_html(str(position_spending_content))" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(383, 14)" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "year = 2013\n", "for df in dfs:\n", " df['year'] = year\n", " year += 1\n", "nfl_df = pd.concat(dfs)\n", "nfl_df.shape" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(288, 14)" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nfl_df = nfl_df.clean_names()\\\n", " .filter_on('year <= 2021')\n", "nfl_df.shape" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\jaime.avendano\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1717: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " isetter(loc, v)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
teamqbrbwrteoloffenseidledgelbscbdefenseyear
0Eagles133851371020311219241989550903625664899740041735264666102411011000481769201586032738384634802013
1Seahawks15570851079965316831423127787882795526169922210770150925013832737723285796193507877521800692013
2Titans6336958153760981268689669795002672198468101436649652887762935740835107770669602477413931992013
3Broncos18716295507063210120554801390224318758662401416880138531355412951882589434617559225485991452013
4Giants2199840050367398677626299891324235900629475785118995952381379168471270499011202110464667552013
\n", "
" ], "text/plain": [ " team qb rb wr te ol offense \\\n", "0 Eagles 13385137 10203112 19241989 5509036 25664899 74004173 \n", "1 Seahawks 1557085 10799653 16831423 12778788 27955261 69922210 \n", "2 Titans 6336958 15376098 12686896 6979500 26721984 68101436 \n", "3 Broncos 18716295 5070632 10120554 8013902 24318758 66240141 \n", "4 Giants 21998400 5036739 8677626 2998913 24235900 62947578 \n", "\n", " idl edge lb s cb defense year \n", "0 5264666 10241101 10004817 6920158 6032738 38463480 2013 \n", "1 7701509 25013832 7377232 8579619 3507877 52180069 2013 \n", "2 6496528 8776293 5740835 10777066 9602477 41393199 2013 \n", "3 6880138 5313554 12951882 5894346 17559225 48599145 2013 \n", "4 5118995 9523813 7916847 12704990 11202110 46466755 2013 " ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nfl_df.loc[:, nfl_df.columns[1:-1]] = nfl_df[nfl_df.columns[1:-1]].replace('[\\$,]', '', regex=True).astype(int)\n", "nfl_df.head()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearcap
02013123000000.0
12014133000000.0
22015143280000.0
32016155270000.0
42017167000000.0
52018177200000.0
62019188200000.0
72020198200000.0
82021182500000.0
\n", "
" ], "text/plain": [ " year cap\n", "0 2013 123000000.0\n", "1 2014 133000000.0\n", "2 2015 143280000.0\n", "3 2016 155270000.0\n", "4 2017 167000000.0\n", "5 2018 177200000.0\n", "6 2019 188200000.0\n", "7 2020 198200000.0\n", "8 2021 182500000.0" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Data from https://en.wikipedia.org/wiki/Salary_cap\n", "nfl_salary_cap = [[2013, 123], [2014, 133], [2015, 143.28], [2016, 155.27], [2017, 167], [2018, 177.2], [2019, 188.2], [2020, 198.2], [2021, 182.5]]\n", "cap_df = pd.DataFrame(nfl_salary_cap, columns=['year', 'cap'])\n", "cap_df.cap = cap_df.cap * 1e6\n", "cap_df" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 288 entries, 0 to 31\n", "Data columns (total 15 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 team 288 non-null category\n", " 1 qb 288 non-null int32 \n", " 2 rb 288 non-null int32 \n", " 3 wr 288 non-null int32 \n", " 4 te 288 non-null int32 \n", " 5 ol 288 non-null int32 \n", " 6 offense 288 non-null int32 \n", " 7 idl 288 non-null int32 \n", " 8 edge 288 non-null int32 \n", " 9 lb 288 non-null int32 \n", " 10 s 288 non-null int32 \n", " 11 cb 288 non-null int32 \n", " 12 defense 288 non-null int32 \n", " 13 year 288 non-null int64 \n", " 14 cap 288 non-null float64 \n", "dtypes: category(1), float64(1), int32(12), int64(1)\n", "memory usage: 22.0 KB\n" ] } ], "source": [ "nfl_df = nfl_df.join(cap_df.set_index('year'), on='year')\n", "nfl_df.team = nfl_df.team.astype('category')\n", "nfl_df.info()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "nfl_df.to_parquet('teams_spending_df.parquet')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pulling NFL Logos\n", "Code based on: https://gist.github.com/Deryck97/dff8d33e9f841568201a2a0d5519ac5e" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
teamteam_codeurl
0Arizona CardinalsARIhttps://upload.wikimedia.org/wikipedia/en/thum...
1Atlanta FalconsATLhttps://upload.wikimedia.org/wikipedia/en/thum...
2Baltimore RavensBALhttps://upload.wikimedia.org/wikipedia/en/thum...
3Buffalo BillsBUFhttps://upload.wikimedia.org/wikipedia/en/thum...
4Carolina PanthersCARhttps://upload.wikimedia.org/wikipedia/en/thum...
\n", "
" ], "text/plain": [ " team team_code \\\n", "0 Arizona Cardinals ARI \n", "1 Atlanta Falcons ATL \n", "2 Baltimore Ravens BAL \n", "3 Buffalo Bills BUF \n", "4 Carolina Panthers CAR \n", "\n", " url \n", "0 https://upload.wikimedia.org/wikipedia/en/thum... \n", "1 https://upload.wikimedia.org/wikipedia/en/thum... \n", "2 https://upload.wikimedia.org/wikipedia/en/thum... \n", "3 https://upload.wikimedia.org/wikipedia/en/thum... \n", "4 https://upload.wikimedia.org/wikipedia/en/thum... " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "teams_logos_df = pd.read_csv('https://raw.githubusercontent.com/statsbylopez/BlogPosts/master/nfl_teamlogos.csv')\n", "teams_logos_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(32, 4)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "teams_logos_df.loc[:, 'team'] = teams_logos_df.team.str.split().str[-1]\n", "teams_logos_df.loc[teams_logos_df.team == 'Team', 'team'] = 'Washington'\n", "teams_logos_df['path'] = os.getcwd() + '\\\\Logos\\\\' + teams_logos_df.team + '.png'\n", "teams_logos_df = teams_logos_df.drop_duplicates(subset='team')\n", "teams_logos_df.shape" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "for i in range(0,len(teams_logos_df)):\n", " img_path = teams_logos_df.path.iloc[i]\n", " urllib.request.urlretrieve(teams_logos_df.url.iloc[i], img_path)\n", " img = Image.open(img_path)\n", " img.thumbnail((100, 100), Image.ANTIALIAS)\n", " img.save(img_path)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "teams_logos_df.to_parquet('teams_logos_df.parquet')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NFL Team Stats" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[2013, 'Broncos', 0.813, 606, 399],\n", " [2013, 'Bears', 0.5, 445, 478],\n", " [2013, 'Patriots', 0.75, 444, 338],\n", " [2013, 'Eagles', 0.625, 442, 382],\n", " [2013, 'Cowboys', 0.5, 439, 432]]" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_info = []\n", "for year in range(2013, 2021):\n", " for team in Teams(year):\n", " team_info.append([year, team.name.split()[-1], team.win_percentage, team.points_for, team.points_against])\n", "team_info[:5]" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearteamwin_pctpoints_forpoints_against
02013Broncos0.813606399
12013Bears0.500445478
22013Patriots0.750444338
32013Eagles0.625442382
42013Cowboys0.500439432
\n", "
" ], "text/plain": [ " year team win_pct points_for points_against\n", "0 2013 Broncos 0.813 606 399\n", "1 2013 Bears 0.500 445 478\n", "2 2013 Patriots 0.750 444 338\n", "3 2013 Eagles 0.625 442 382\n", "4 2013 Cowboys 0.500 439 432" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_detail_df = pd.DataFrame(team_info, columns=['year', 'team', 'win_pct', 'points_for', 'points_against'])\n", "team_detail_df.loc[team_detail_df.team == 'Redskins', 'team'] = 'Washington'\n", "team_detail_df.loc[team_detail_df.team == 'Team', 'team'] = 'Washington'\n", "team_detail_df.head()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(256, 5)" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_detail_df.to_parquet('teams_detail_df.parquet')\n", "team_detail_df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }