{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert CSV/DataFrames to PostgreSQL tables\n", "We use a single database \"cbb\" to store various tables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import glob\n", "import datetime as dt\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from watchcbb.sql import SQLEngine" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "sql = SQLEngine('cbb')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Connect to 'cbb' database" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start with simple teams.csv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>team_id</th>\n", " <th>display_name</th>\n", " <th>flair_name</th>\n", " <th>conference</th>\n", " <th>location</th>\n", " <th>year_start</th>\n", " <th>year_end</th>\n", " <th>color</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>abilene-christian</td>\n", " <td>Abilene Christian</td>\n", " <td>Abilene Christian Wildcats</td>\n", " <td>Southland</td>\n", " <td>Abilene, Texas</td>\n", " <td>1971</td>\n", " <td>2020</td>\n", " <td>#4e2583</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>air-force</td>\n", " <td>Air Force</td>\n", " <td>Air Force Falcons</td>\n", " <td>MWC</td>\n", " <td>USAF Academy, Colorado</td>\n", " <td>1958</td>\n", " <td>2020</td>\n", " <td>#0061aa</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>akron</td>\n", " <td>Akron</td>\n", " <td>Akron Zips</td>\n", " <td>MAC</td>\n", " <td>Akron, Ohio</td>\n", " <td>1902</td>\n", " <td>2020</td>\n", " <td>#a89968</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>alabama-am</td>\n", " <td>Alabama A&M</td>\n", " <td>NaN</td>\n", " <td>SWAC</td>\n", " <td>Normal, Alabama</td>\n", " <td>2000</td>\n", " <td>2020</td>\n", " <td>#661012</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>alabama</td>\n", " <td>Alabama</td>\n", " <td>Alabama Crimson Tide</td>\n", " <td>SEC</td>\n", " <td>Tuscaloosa, Alabama</td>\n", " <td>1913</td>\n", " <td>2020</td>\n", " <td>#aa1c37</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " team_id display_name flair_name \\\n", "0 abilene-christian Abilene Christian Abilene Christian Wildcats \n", "1 air-force Air Force Air Force Falcons \n", "2 akron Akron Akron Zips \n", "3 alabama-am Alabama A&M NaN \n", "4 alabama Alabama Alabama Crimson Tide \n", "\n", " conference location year_start year_end color \n", "0 Southland Abilene, Texas 1971 2020 #4e2583 \n", "1 MWC USAF Academy, Colorado 1958 2020 #0061aa \n", "2 MAC Akron, Ohio 1902 2020 #a89968 \n", "3 SWAC Normal, Alabama 2000 2020 #661012 \n", "4 SEC Tuscaloosa, Alabama 1913 2020 #aa1c37 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('../data/teams.csv')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 357 entries, 0 to 356\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 team_id 357 non-null object\n", " 1 display_name 357 non-null object\n", " 2 flair_name 295 non-null object\n", " 3 conference 357 non-null object\n", " 4 location 357 non-null object\n", " 5 year_start 357 non-null int64 \n", " 6 year_end 357 non-null int64 \n", " 7 color 354 non-null object\n", "dtypes: int64(2), object(6)\n", "memory usage: 22.4+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "sql.df_to_sql(df, 'teams', if_exists='replace')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### AP Rankings" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'list'>\n" ] }, { "data": { "text/plain": [ "160 2018-12-17\n", "161 2018-12-24\n", "162 2018-12-31\n", "163 2019-01-07\n", "164 2019-01-14\n", "165 2019-01-21\n", "166 2019-01-28\n", "167 2019-02-04\n", "168 2019-02-11\n", "169 2019-02-18\n", "Name: date, dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ap = pd.read_csv('../data/ap_rankings.csv')\n", "# they are read in as strings, so convert to lists\n", "for i in range(1,26):\n", " df_ap['r'+str(i)] = df_ap['r'+str(i)].apply(eval)\n", "df_ap.date = df_ap.date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))\n", "print(type(df_ap.r1[0]))\n", "df_ap.date.iloc[160:170]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 194 entries, 0 to 193\n", "Data columns (total 26 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 194 non-null object\n", " 1 r1 194 non-null object\n", " 2 r2 194 non-null object\n", " 3 r3 194 non-null object\n", " 4 r4 194 non-null object\n", " 5 r5 194 non-null object\n", " 6 r6 194 non-null object\n", " 7 r7 194 non-null object\n", " 8 r8 194 non-null object\n", " 9 r9 194 non-null object\n", " 10 r10 194 non-null object\n", " 11 r11 194 non-null object\n", " 12 r12 194 non-null object\n", " 13 r13 194 non-null object\n", " 14 r14 194 non-null object\n", " 15 r15 194 non-null object\n", " 16 r16 194 non-null object\n", " 17 r17 194 non-null object\n", " 18 r18 194 non-null object\n", " 19 r19 194 non-null object\n", " 20 r20 194 non-null object\n", " 21 r21 194 non-null object\n", " 22 r22 194 non-null object\n", " 23 r23 194 non-null object\n", " 24 r24 194 non-null object\n", " 25 r25 194 non-null object\n", "dtypes: object(26)\n", "memory usage: 39.5+ KB\n" ] } ], "source": [ "df_ap.info()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "sql.df_to_sql(df_ap, 'ap_rankings', if_exists='replace')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Game-by-game data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape: (54760, 39)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Season</th>\n", " <th>Date</th>\n", " <th>gid</th>\n", " <th>Type</th>\n", " <th>WTeamID</th>\n", " <th>WScore</th>\n", " <th>LTeamID</th>\n", " <th>LScore</th>\n", " <th>Wrank</th>\n", " <th>Lrank</th>\n", " <th>...</th>\n", " <th>LFTM</th>\n", " <th>LFTA</th>\n", " <th>LOR</th>\n", " <th>LDR</th>\n", " <th>LAst</th>\n", " <th>LTO</th>\n", " <th>LStl</th>\n", " <th>LBlk</th>\n", " <th>LPF</th>\n", " <th>poss</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2011</td>\n", " <td>2010-11-08</td>\n", " <td>2010-11-08_california-irvine_illinois</td>\n", " <td>RG</td>\n", " <td>illinois</td>\n", " <td>79</td>\n", " <td>california-irvine</td>\n", " <td>65</td>\n", " <td>13</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>14</td>\n", " <td>22</td>\n", " <td>11</td>\n", " <td>26</td>\n", " <td>12</td>\n", " <td>14</td>\n", " <td>7</td>\n", " <td>1</td>\n", " <td>21</td>\n", " <td>74.90</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2011</td>\n", " <td>2010-11-08</td>\n", " <td>2010-11-08_pittsburgh_rhode-island</td>\n", " <td>RG</td>\n", " <td>pittsburgh</td>\n", " <td>83</td>\n", " <td>rhode-island</td>\n", " <td>75</td>\n", " <td>5</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>13</td>\n", " <td>16</td>\n", " <td>9</td>\n", " <td>20</td>\n", " <td>16</td>\n", " <td>16</td>\n", " <td>7</td>\n", " <td>6</td>\n", " <td>25</td>\n", " <td>75.62</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2011</td>\n", " <td>2010-11-08</td>\n", " <td>2010-11-08_maryland_seattle</td>\n", " <td>RG</td>\n", " <td>maryland</td>\n", " <td>105</td>\n", " <td>seattle</td>\n", " <td>76</td>\n", " <td>-1</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>17</td>\n", " <td>22</td>\n", " <td>13</td>\n", " <td>16</td>\n", " <td>11</td>\n", " <td>27</td>\n", " <td>18</td>\n", " <td>1</td>\n", " <td>31</td>\n", " <td>98.36</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2011</td>\n", " <td>2010-11-08</td>\n", " <td>2010-11-08_navy_texas</td>\n", " <td>RG</td>\n", " <td>texas</td>\n", " <td>83</td>\n", " <td>navy</td>\n", " <td>52</td>\n", " <td>-1</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>10</td>\n", " <td>18</td>\n", " <td>5</td>\n", " <td>21</td>\n", " <td>8</td>\n", " <td>16</td>\n", " <td>5</td>\n", " <td>5</td>\n", " <td>24</td>\n", " <td>80.44</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2011</td>\n", " <td>2010-11-10</td>\n", " <td>2010-11-10_college-of-charleston_maryland</td>\n", " <td>RG</td>\n", " <td>maryland</td>\n", " <td>75</td>\n", " <td>college-of-charleston</td>\n", " <td>74</td>\n", " <td>-1</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>16</td>\n", " <td>19</td>\n", " <td>10</td>\n", " <td>25</td>\n", " <td>12</td>\n", " <td>22</td>\n", " <td>8</td>\n", " <td>8</td>\n", " <td>19</td>\n", " <td>77.64</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>2011</td>\n", " <td>2010-11-10</td>\n", " <td>2010-11-10_illinois-chicago_pittsburgh</td>\n", " <td>RG</td>\n", " <td>pittsburgh</td>\n", " <td>97</td>\n", " <td>illinois-chicago</td>\n", " <td>54</td>\n", " <td>5</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>7</td>\n", " <td>9</td>\n", " <td>5</td>\n", " <td>15</td>\n", " <td>9</td>\n", " <td>11</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>18</td>\n", " <td>67.88</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>2011</td>\n", " <td>2010-11-10</td>\n", " <td>2010-11-10_illinois_toledo</td>\n", " <td>RG</td>\n", " <td>illinois</td>\n", " <td>84</td>\n", " <td>toledo</td>\n", " <td>45</td>\n", " <td>13</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>10</td>\n", " <td>19</td>\n", " <td>3</td>\n", " <td>16</td>\n", " <td>9</td>\n", " <td>23</td>\n", " <td>6</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>72.00</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>2011</td>\n", " <td>2010-11-10</td>\n", " <td>2010-11-10_louisiana-tech_texas</td>\n", " <td>RG</td>\n", " <td>texas</td>\n", " <td>89</td>\n", " <td>louisiana-tech</td>\n", " <td>58</td>\n", " <td>-1</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>13</td>\n", " <td>20</td>\n", " <td>13</td>\n", " <td>25</td>\n", " <td>4</td>\n", " <td>22</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>21</td>\n", " <td>83.74</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>2011</td>\n", " <td>2010-11-12</td>\n", " <td>2010-11-12_maryland-eastern-shore_tulane</td>\n", " <td>RG</td>\n", " <td>tulane</td>\n", " <td>91</td>\n", " <td>maryland-eastern-shore</td>\n", " <td>62</td>\n", " <td>-1</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>19</td>\n", " <td>26</td>\n", " <td>9</td>\n", " <td>16</td>\n", " <td>11</td>\n", " <td>21</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>28</td>\n", " <td>78.04</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>2011</td>\n", " <td>2010-11-12</td>\n", " <td>2010-11-12_marquette_prairie-view</td>\n", " <td>RG</td>\n", " <td>marquette</td>\n", " <td>97</td>\n", " <td>prairie-view</td>\n", " <td>58</td>\n", " <td>-1</td>\n", " <td>-1</td>\n", " <td>...</td>\n", " <td>7</td>\n", " <td>11</td>\n", " <td>3</td>\n", " <td>13</td>\n", " <td>12</td>\n", " <td>20</td>\n", " <td>11</td>\n", " <td>3</td>\n", " <td>27</td>\n", " <td>79.12</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>10 rows × 39 columns</p>\n", "</div>" ], "text/plain": [ " Season Date gid Type \\\n", "0 2011 2010-11-08 2010-11-08_california-irvine_illinois RG \n", "1 2011 2010-11-08 2010-11-08_pittsburgh_rhode-island RG \n", "2 2011 2010-11-08 2010-11-08_maryland_seattle RG \n", "3 2011 2010-11-08 2010-11-08_navy_texas RG \n", "4 2011 2010-11-10 2010-11-10_college-of-charleston_maryland RG \n", "5 2011 2010-11-10 2010-11-10_illinois-chicago_pittsburgh RG \n", "6 2011 2010-11-10 2010-11-10_illinois_toledo RG \n", "7 2011 2010-11-10 2010-11-10_louisiana-tech_texas RG \n", "8 2011 2010-11-12 2010-11-12_maryland-eastern-shore_tulane RG \n", "9 2011 2010-11-12 2010-11-12_marquette_prairie-view RG \n", "\n", " WTeamID WScore LTeamID LScore Wrank Lrank ... LFTM \\\n", "0 illinois 79 california-irvine 65 13 -1 ... 14 \n", "1 pittsburgh 83 rhode-island 75 5 -1 ... 13 \n", "2 maryland 105 seattle 76 -1 -1 ... 17 \n", "3 texas 83 navy 52 -1 -1 ... 10 \n", "4 maryland 75 college-of-charleston 74 -1 -1 ... 16 \n", "5 pittsburgh 97 illinois-chicago 54 5 -1 ... 7 \n", "6 illinois 84 toledo 45 13 -1 ... 10 \n", "7 texas 89 louisiana-tech 58 -1 -1 ... 13 \n", "8 tulane 91 maryland-eastern-shore 62 -1 -1 ... 19 \n", "9 marquette 97 prairie-view 58 -1 -1 ... 7 \n", "\n", " LFTA LOR LDR LAst LTO LStl LBlk LPF poss \n", "0 22 11 26 12 14 7 1 21 74.90 \n", "1 16 9 20 16 16 7 6 25 75.62 \n", "2 22 13 16 11 27 18 1 31 98.36 \n", "3 18 5 21 8 16 5 5 24 80.44 \n", "4 19 10 25 12 22 8 8 19 77.64 \n", "5 9 5 15 9 11 0 3 18 67.88 \n", "6 19 3 16 9 23 6 0 14 72.00 \n", "7 20 13 25 4 22 6 5 21 83.74 \n", "8 26 9 16 11 21 4 2 28 78.04 \n", "9 11 3 13 12 20 11 3 27 79.12 \n", "\n", "[10 rows x 39 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_gid(row):\n", " tid1, tid2 = sorted((row.WTeamID, row.LTeamID))\n", " return '{0}_{1}_{2}'.format(row.Date, tid1, tid2)\n", "\n", "def get_ap_ranks(row):\n", " date = row.Date\n", " wid, lid = row.WTeamID, row.LTeamID\n", " idx = np.argmax(date < df_ap.date) - 1\n", " r1, r2 = -1, -1\n", " ranks = df_ap.iloc[idx].values[1:].tolist()\n", " for irank,ts in enumerate(ranks):\n", " if wid in ts:\n", " r1 = irank+1\n", " if lid in ts:\n", " r2 = irank+1\n", " return r1,r2\n", "\n", "dfs = []\n", "for fname in glob.glob(\"../data/game_data/*.csv\"):\n", " dfs.append(pd.read_csv(fname))\n", "df = pd.concat(dfs)\n", "df.Date = df.Date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))\n", "df = df.sort_values(\"Date\").reset_index(drop=True)\n", "df[\"poss\"] = 0.5*(df[\"WFGA\"] + 0.44*df[\"WFTA\"] - df[\"WOR\"] + df[\"WTO\"] + df[\"LFGA\"] + 0.44*df[\"LFTA\"] - df[\"LOR\"] + df[\"LTO\"])\n", "df.insert(2, \"gid\", df.apply(get_gid, axis=1))\n", "apranks = df.apply(get_ap_ranks, axis=1).values\n", "df.insert(8, \"Wrank\", [x[0] for x in apranks])\n", "df.insert(9, \"Lrank\", [x[1] for x in apranks])\n", "print(\"Shape:\",df.shape)\n", "df.head(10)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 54760 entries, 0 to 54759\n", "Data columns (total 39 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Season 54760 non-null int64 \n", " 1 Date 54760 non-null object \n", " 2 gid 54760 non-null object \n", " 3 Type 54760 non-null object \n", " 4 WTeamID 54760 non-null object \n", " 5 WScore 54760 non-null int64 \n", " 6 LTeamID 54760 non-null object \n", " 7 LScore 54760 non-null int64 \n", " 8 Wrank 54760 non-null int64 \n", " 9 Lrank 54760 non-null int64 \n", " 10 WLoc 54760 non-null object \n", " 11 NumOT 54760 non-null int64 \n", " 12 WFGM 54760 non-null int64 \n", " 13 WFGA 54760 non-null int64 \n", " 14 WFGM3 54760 non-null int64 \n", " 15 WFGA3 54760 non-null int64 \n", " 16 WFTM 54760 non-null int64 \n", " 17 WFTA 54760 non-null int64 \n", " 18 WOR 54760 non-null int64 \n", " 19 WDR 54760 non-null int64 \n", " 20 WAst 54760 non-null int64 \n", " 21 WTO 54760 non-null int64 \n", " 22 WStl 54760 non-null int64 \n", " 23 WBlk 54760 non-null int64 \n", " 24 WPF 54760 non-null int64 \n", " 25 LFGM 54760 non-null int64 \n", " 26 LFGA 54760 non-null int64 \n", " 27 LFGM3 54760 non-null int64 \n", " 28 LFGA3 54760 non-null int64 \n", " 29 LFTM 54760 non-null int64 \n", " 30 LFTA 54760 non-null int64 \n", " 31 LOR 54760 non-null int64 \n", " 32 LDR 54760 non-null int64 \n", " 33 LAst 54760 non-null int64 \n", " 34 LTO 54760 non-null int64 \n", " 35 LStl 54760 non-null int64 \n", " 36 LBlk 54760 non-null int64 \n", " 37 LPF 54760 non-null int64 \n", " 38 poss 54760 non-null float64\n", "dtypes: float64(1), int64(32), object(6)\n", "memory usage: 16.3+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "sql.df_to_sql(df, 'game_data', if_exists='replace')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reddit submissions" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>date</th>\n", " <th>timestamp</th>\n", " <th>id</th>\n", " <th>gid</th>\n", " <th>is_postgame</th>\n", " <th>ups</th>\n", " <th>num_comments</th>\n", " <th>title</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>157</th>\n", " <td>2017-11-10</td>\n", " <td>2017-11-11 08:58:11</td>\n", " <td>7c718s</td>\n", " <td>2017-11-10_alabama-birmingham_jacksonville</td>\n", " <td>True</td>\n", " <td>9</td>\n", " <td>0</td>\n", " <td>[Post Game Thread] UAB defeats Jacksonville, 9...</td>\n", " </tr>\n", " <tr>\n", " <th>51</th>\n", " <td>2017-11-10</td>\n", " <td>2017-11-11 01:40:04</td>\n", " <td>7c4wus</td>\n", " <td>2017-11-10_alabama_memphis</td>\n", " <td>False</td>\n", " <td>6</td>\n", " <td>81</td>\n", " <td>[Game Thread] Memphis @ Alabama (6:30 PM ET)</td>\n", " </tr>\n", " <tr>\n", " <th>56</th>\n", " <td>2017-11-10</td>\n", " <td>2017-11-11 01:55:53</td>\n", " <td>7c4zzd</td>\n", " <td>2017-11-10_albany-ny_iona</td>\n", " <td>False</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>[Game Thread] Iona @ Albany (7:00 EST)</td>\n", " </tr>\n", " <tr>\n", " <th>65</th>\n", " <td>2017-11-10</td>\n", " <td>2017-11-11 02:07:12</td>\n", " <td>7c528t</td>\n", " <td>2017-11-10_arizona_northern-arizona</td>\n", " <td>False</td>\n", " <td>13</td>\n", " <td>82</td>\n", " <td>[Game Thread] Northern Arizona @ #2 Arizona (8...</td>\n", " </tr>\n", " <tr>\n", " <th>108</th>\n", " <td>2017-11-10</td>\n", " <td>2017-11-11 04:45:38</td>\n", " <td>7c5x2w</td>\n", " <td>2017-11-10_austin-peay_vanderbilt</td>\n", " <td>False</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>[Game Thread] Austin Peay @ Vanderbilt (8:30 P...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " date timestamp id \\\n", "157 2017-11-10 2017-11-11 08:58:11 7c718s \n", "51 2017-11-10 2017-11-11 01:40:04 7c4wus \n", "56 2017-11-10 2017-11-11 01:55:53 7c4zzd \n", "65 2017-11-10 2017-11-11 02:07:12 7c528t \n", "108 2017-11-10 2017-11-11 04:45:38 7c5x2w \n", "\n", " gid is_postgame ups \\\n", "157 2017-11-10_alabama-birmingham_jacksonville True 9 \n", "51 2017-11-10_alabama_memphis False 6 \n", "56 2017-11-10_albany-ny_iona False 7 \n", "65 2017-11-10_arizona_northern-arizona False 13 \n", "108 2017-11-10_austin-peay_vanderbilt False 4 \n", "\n", " num_comments title \n", "157 0 [Post Game Thread] UAB defeats Jacksonville, 9... \n", "51 81 [Game Thread] Memphis @ Alabama (6:30 PM ET) \n", "56 8 [Game Thread] Iona @ Albany (7:00 EST) \n", "65 82 [Game Thread] Northern Arizona @ #2 Arizona (8... \n", "108 2 [Game Thread] Austin Peay @ Vanderbilt (8:30 P... " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_pickle('../data/gamethreads/aggregated_cleaned_2017-2020.pkl.gz', compression='gzip')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "Int64Index: 14387 entries, 157 to 19707\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 14387 non-null object \n", " 1 timestamp 14387 non-null datetime64[ns]\n", " 2 id 14387 non-null object \n", " 3 gid 14387 non-null object \n", " 4 is_postgame 14387 non-null bool \n", " 5 ups 14387 non-null int64 \n", " 6 num_comments 14387 non-null int64 \n", " 7 title 14387 non-null object \n", "dtypes: bool(1), datetime64[ns](1), int64(2), object(4)\n", "memory usage: 913.2+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "sql.df_to_sql(df, 'gamethreads', if_exists='replace')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reddit gamethread comments" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>post_id</th>\n", " <th>comment_id</th>\n", " <th>author</th>\n", " <th>author_flair</th>\n", " <th>text</th>\n", " <th>is_postgame</th>\n", " <th>gid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>dsh9ec</td>\n", " <td>f6plj2x</td>\n", " <td>CaffeinationGoat</td>\n", " <td>[Connecticut Huskies, Binghamton Bearcats]</td>\n", " <td>Ah well, the annual NYS private/public ivy sho...</td>\n", " <td>True</td>\n", " <td>2019-11-06_binghamton_cornell</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>dsh9ec</td>\n", " <td>f6pa84d</td>\n", " <td>cheesoidhateself</td>\n", " <td>[Cornell Big Red]</td>\n", " <td>Consolidation of land!</td>\n", " <td>True</td>\n", " <td>2019-11-06_binghamton_cornell</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>dsh9ec</td>\n", " <td>f6qn8tq</td>\n", " <td>PAPA_JOHNS_ZIMBABWE</td>\n", " <td>[NJIT Highlanders]</td>\n", " <td>SUNY Ithaca is bringing on the pain</td>\n", " <td>True</td>\n", " <td>2019-11-06_binghamton_cornell</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>dsh9ec</td>\n", " <td>f6pbzq4</td>\n", " <td>IsYouWitItYaBish</td>\n", " <td>[Wisconsin Badgers]</td>\n", " <td>Wow what time did this game tip off?</td>\n", " <td>True</td>\n", " <td>2019-11-06_binghamton_cornell</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>dso2yc</td>\n", " <td>f6qrvlq</td>\n", " <td>mrfixit420</td>\n", " <td>[Wake Forest Demon Deacons]</td>\n", " <td>I can’t believe how dominant BC is. There are ...</td>\n", " <td>False</td>\n", " <td>2019-11-06_boston-college_wake-forest</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " post_id comment_id author \\\n", "0 dsh9ec f6plj2x CaffeinationGoat \n", "1 dsh9ec f6pa84d cheesoidhateself \n", "2 dsh9ec f6qn8tq PAPA_JOHNS_ZIMBABWE \n", "3 dsh9ec f6pbzq4 IsYouWitItYaBish \n", "4 dso2yc f6qrvlq mrfixit420 \n", "\n", " author_flair \\\n", "0 [Connecticut Huskies, Binghamton Bearcats] \n", "1 [Cornell Big Red] \n", "2 [NJIT Highlanders] \n", "3 [Wisconsin Badgers] \n", "4 [Wake Forest Demon Deacons] \n", "\n", " text is_postgame \\\n", "0 Ah well, the annual NYS private/public ivy sho... True \n", "1 Consolidation of land! True \n", "2 SUNY Ithaca is bringing on the pain True \n", "3 Wow what time did this game tip off? True \n", "4 I can’t believe how dominant BC is. There are ... False \n", "\n", " gid \n", "0 2019-11-06_binghamton_cornell \n", "1 2019-11-06_binghamton_cornell \n", "2 2019-11-06_binghamton_cornell \n", "3 2019-11-06_binghamton_cornell \n", "4 2019-11-06_boston-college_wake-forest " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfs = []\n", "for f in glob.glob('../data/gamethread_comments/*/*.pkl.gz'):\n", " dfs.append(pd.read_pickle(f, compression='gzip'))\n", "dfc = pd.concat(dfs)\n", "dfc = dfc.merge(df[['id','is_postgame','gid']], left_on='post_id', right_on='id')\n", "dfc.drop('id', axis=1, inplace=True)\n", "dfc.author = dfc.author.apply(lambda x: None if x is None else x.name)\n", "dfc.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "Int64Index: 387257 entries, 0 to 387256\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 post_id 387257 non-null object\n", " 1 comment_id 387257 non-null object\n", " 2 author 368148 non-null object\n", " 3 author_flair 387257 non-null object\n", " 4 text 387257 non-null object\n", " 5 is_postgame 387257 non-null bool \n", " 6 gid 387257 non-null object\n", "dtypes: bool(1), object(6)\n", "memory usage: 21.1+ MB\n" ] } ], "source": [ "dfc.info()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "sql.df_to_sql(dfc, 'gamethread_comments', if_exists='replace')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preseason predictions" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>year</th>\n", " <th>team_id</th>\n", " <th>final_eff</th>\n", " <th>yearm1_eff</th>\n", " <th>yearm2_eff</th>\n", " <th>wshares_lost</th>\n", " <th>wshares_return</th>\n", " <th>wshares_transfer</th>\n", " <th>espn_recruit_rank</th>\n", " <th>rsci_recruit_rank</th>\n", " <th>rsci_recruit_points</th>\n", " <th>yearm1_oeff</th>\n", " <th>yearm1_deff</th>\n", " <th>yearm1_pace</th>\n", " <th>final_pace</th>\n", " <th>recruit_score</th>\n", " <th>pred_eff</th>\n", " <th>pred_pace</th>\n", " <th>pred_oeff</th>\n", " <th>pred_deff</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2013</td>\n", " <td>air-force</td>\n", " <td>7.076</td>\n", " <td>-0.511</td>\n", " <td>5.814</td>\n", " <td>1.7</td>\n", " <td>12.0</td>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>98.320</td>\n", " <td>98.831</td>\n", " <td>62.424</td>\n", " <td>64.534</td>\n", " <td>0</td>\n", " <td>4.862</td>\n", " <td>65.030</td>\n", " <td>101.000</td>\n", " <td>96.137</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2013</td>\n", " <td>akron</td>\n", " <td>11.615</td>\n", " <td>11.359</td>\n", " <td>3.769</td>\n", " <td>6.0</td>\n", " <td>15.8</td>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>105.445</td>\n", " <td>94.086</td>\n", " <td>68.663</td>\n", " <td>67.137</td>\n", " <td>0</td>\n", " <td>9.664</td>\n", " <td>68.779</td>\n", " <td>104.549</td>\n", " <td>94.885</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2013</td>\n", " <td>alabama</td>\n", " <td>13.221</td>\n", " <td>18.570</td>\n", " <td>15.892</td>\n", " <td>7.8</td>\n", " <td>14.9</td>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>25.0</td>\n", " <td>79</td>\n", " <td>104.865</td>\n", " <td>86.295</td>\n", " <td>63.449</td>\n", " <td>62.245</td>\n", " <td>79</td>\n", " <td>18.518</td>\n", " <td>65.646</td>\n", " <td>104.837</td>\n", " <td>86.319</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2013</td>\n", " <td>alabama-am</td>\n", " <td>-24.749</td>\n", " <td>-24.830</td>\n", " <td>-16.969</td>\n", " <td>3.2</td>\n", " <td>4.4</td>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>87.082</td>\n", " <td>111.912</td>\n", " <td>68.426</td>\n", " <td>66.737</td>\n", " <td>0</td>\n", " <td>-20.857</td>\n", " <td>68.636</td>\n", " <td>88.820</td>\n", " <td>109.677</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2013</td>\n", " <td>alabama-birmingham</td>\n", " <td>-0.204</td>\n", " <td>4.770</td>\n", " <td>13.027</td>\n", " <td>7.0</td>\n", " <td>7.1</td>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>98.949</td>\n", " <td>94.178</td>\n", " <td>62.799</td>\n", " <td>71.980</td>\n", " <td>0</td>\n", " <td>5.359</td>\n", " <td>65.256</td>\n", " <td>99.250</td>\n", " <td>93.891</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " year team_id final_eff yearm1_eff yearm2_eff wshares_lost \\\n", "0 2013 air-force 7.076 -0.511 5.814 1.7 \n", "1 2013 akron 11.615 11.359 3.769 6.0 \n", "2 2013 alabama 13.221 18.570 15.892 7.8 \n", "3 2013 alabama-am -24.749 -24.830 -16.969 3.2 \n", "4 2013 alabama-birmingham -0.204 4.770 13.027 7.0 \n", "\n", " wshares_return wshares_transfer espn_recruit_rank rsci_recruit_rank \\\n", "0 12.0 0.0 NaN NaN \n", "1 15.8 0.0 NaN NaN \n", "2 14.9 0.0 NaN 25.0 \n", "3 4.4 0.0 NaN NaN \n", "4 7.1 0.0 NaN NaN \n", "\n", " rsci_recruit_points yearm1_oeff yearm1_deff yearm1_pace final_pace \\\n", "0 0 98.320 98.831 62.424 64.534 \n", "1 0 105.445 94.086 68.663 67.137 \n", "2 79 104.865 86.295 63.449 62.245 \n", "3 0 87.082 111.912 68.426 66.737 \n", "4 0 98.949 94.178 62.799 71.980 \n", "\n", " recruit_score pred_eff pred_pace pred_oeff pred_deff \n", "0 0 4.862 65.030 101.000 96.137 \n", "1 0 9.664 68.779 104.549 94.885 \n", "2 79 18.518 65.646 104.837 86.319 \n", "3 0 -20.857 68.636 88.820 109.677 \n", "4 0 5.359 65.256 99.250 93.891 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('../data/preseason_predictions.csv')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 2792 entries, 0 to 2791\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 year 2792 non-null int64 \n", " 1 team_id 2792 non-null object \n", " 2 final_eff 2792 non-null float64\n", " 3 yearm1_eff 2792 non-null float64\n", " 4 yearm2_eff 2792 non-null float64\n", " 5 wshares_lost 2792 non-null float64\n", " 6 wshares_return 2792 non-null float64\n", " 7 wshares_transfer 2792 non-null float64\n", " 8 espn_recruit_rank 288 non-null float64\n", " 9 rsci_recruit_rank 404 non-null float64\n", " 10 rsci_recruit_points 2792 non-null int64 \n", " 11 yearm1_oeff 2792 non-null float64\n", " 12 yearm1_deff 2792 non-null float64\n", " 13 yearm1_pace 2792 non-null float64\n", " 14 final_pace 2792 non-null float64\n", " 15 recruit_score 2792 non-null int64 \n", " 16 pred_eff 2792 non-null float64\n", " 17 pred_pace 2792 non-null float64\n", " 18 pred_oeff 2792 non-null float64\n", " 19 pred_deff 2792 non-null float64\n", "dtypes: float64(16), int64(3), object(1)\n", "memory usage: 436.4+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "sql.df_to_sql(df, 'preseason_predictions', if_exists='replace')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }