{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert CSV/DataFrames to PostgreSQL tables\n",
    "We use a single database \"cbb\" to store various tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import datetime as dt\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from watchcbb.sql import SQLEngine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = SQLEngine('cbb')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Connect to 'cbb' database"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Start with simple teams.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>team_id</th>\n",
       "      <th>display_name</th>\n",
       "      <th>flair_name</th>\n",
       "      <th>conference</th>\n",
       "      <th>location</th>\n",
       "      <th>year_start</th>\n",
       "      <th>year_end</th>\n",
       "      <th>color</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>abilene-christian</td>\n",
       "      <td>Abilene Christian</td>\n",
       "      <td>Abilene Christian Wildcats</td>\n",
       "      <td>Southland</td>\n",
       "      <td>Abilene, Texas</td>\n",
       "      <td>1971</td>\n",
       "      <td>2020</td>\n",
       "      <td>#4e2583</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>air-force</td>\n",
       "      <td>Air Force</td>\n",
       "      <td>Air Force Falcons</td>\n",
       "      <td>MWC</td>\n",
       "      <td>USAF Academy, Colorado</td>\n",
       "      <td>1958</td>\n",
       "      <td>2020</td>\n",
       "      <td>#0061aa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>akron</td>\n",
       "      <td>Akron</td>\n",
       "      <td>Akron Zips</td>\n",
       "      <td>MAC</td>\n",
       "      <td>Akron, Ohio</td>\n",
       "      <td>1902</td>\n",
       "      <td>2020</td>\n",
       "      <td>#a89968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>alabama-am</td>\n",
       "      <td>Alabama A&amp;M</td>\n",
       "      <td>NaN</td>\n",
       "      <td>SWAC</td>\n",
       "      <td>Normal, Alabama</td>\n",
       "      <td>2000</td>\n",
       "      <td>2020</td>\n",
       "      <td>#661012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>alabama</td>\n",
       "      <td>Alabama</td>\n",
       "      <td>Alabama Crimson Tide</td>\n",
       "      <td>SEC</td>\n",
       "      <td>Tuscaloosa, Alabama</td>\n",
       "      <td>1913</td>\n",
       "      <td>2020</td>\n",
       "      <td>#aa1c37</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             team_id       display_name                  flair_name  \\\n",
       "0  abilene-christian  Abilene Christian  Abilene Christian Wildcats   \n",
       "1          air-force          Air Force           Air Force Falcons   \n",
       "2              akron              Akron                  Akron Zips   \n",
       "3         alabama-am        Alabama A&M                         NaN   \n",
       "4            alabama            Alabama        Alabama Crimson Tide   \n",
       "\n",
       "  conference                location  year_start  year_end    color  \n",
       "0  Southland          Abilene, Texas        1971      2020  #4e2583  \n",
       "1        MWC  USAF Academy, Colorado        1958      2020  #0061aa  \n",
       "2        MAC             Akron, Ohio        1902      2020  #a89968  \n",
       "3       SWAC         Normal, Alabama        2000      2020  #661012  \n",
       "4        SEC     Tuscaloosa, Alabama        1913      2020  #aa1c37  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('../data/teams.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 357 entries, 0 to 356\n",
      "Data columns (total 8 columns):\n",
      " #   Column        Non-Null Count  Dtype \n",
      "---  ------        --------------  ----- \n",
      " 0   team_id       357 non-null    object\n",
      " 1   display_name  357 non-null    object\n",
      " 2   flair_name    295 non-null    object\n",
      " 3   conference    357 non-null    object\n",
      " 4   location      357 non-null    object\n",
      " 5   year_start    357 non-null    int64 \n",
      " 6   year_end      357 non-null    int64 \n",
      " 7   color         354 non-null    object\n",
      "dtypes: int64(2), object(6)\n",
      "memory usage: 22.4+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql.df_to_sql(df, 'teams', if_exists='replace')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### AP Rankings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'list'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "160    2018-12-17\n",
       "161    2018-12-24\n",
       "162    2018-12-31\n",
       "163    2019-01-07\n",
       "164    2019-01-14\n",
       "165    2019-01-21\n",
       "166    2019-01-28\n",
       "167    2019-02-04\n",
       "168    2019-02-11\n",
       "169    2019-02-18\n",
       "Name: date, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ap = pd.read_csv('../data/ap_rankings.csv')\n",
    "# they are read in as strings, so convert to lists\n",
    "for i in range(1,26):\n",
    "    df_ap['r'+str(i)] = df_ap['r'+str(i)].apply(eval)\n",
    "df_ap.date = df_ap.date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))\n",
    "print(type(df_ap.r1[0]))\n",
    "df_ap.date.iloc[160:170]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 194 entries, 0 to 193\n",
      "Data columns (total 26 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   date    194 non-null    object\n",
      " 1   r1      194 non-null    object\n",
      " 2   r2      194 non-null    object\n",
      " 3   r3      194 non-null    object\n",
      " 4   r4      194 non-null    object\n",
      " 5   r5      194 non-null    object\n",
      " 6   r6      194 non-null    object\n",
      " 7   r7      194 non-null    object\n",
      " 8   r8      194 non-null    object\n",
      " 9   r9      194 non-null    object\n",
      " 10  r10     194 non-null    object\n",
      " 11  r11     194 non-null    object\n",
      " 12  r12     194 non-null    object\n",
      " 13  r13     194 non-null    object\n",
      " 14  r14     194 non-null    object\n",
      " 15  r15     194 non-null    object\n",
      " 16  r16     194 non-null    object\n",
      " 17  r17     194 non-null    object\n",
      " 18  r18     194 non-null    object\n",
      " 19  r19     194 non-null    object\n",
      " 20  r20     194 non-null    object\n",
      " 21  r21     194 non-null    object\n",
      " 22  r22     194 non-null    object\n",
      " 23  r23     194 non-null    object\n",
      " 24  r24     194 non-null    object\n",
      " 25  r25     194 non-null    object\n",
      "dtypes: object(26)\n",
      "memory usage: 39.5+ KB\n"
     ]
    }
   ],
   "source": [
    "df_ap.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql.df_to_sql(df_ap, 'ap_rankings', if_exists='replace')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Game-by-game data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape: (54760, 39)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Season</th>\n",
       "      <th>Date</th>\n",
       "      <th>gid</th>\n",
       "      <th>Type</th>\n",
       "      <th>WTeamID</th>\n",
       "      <th>WScore</th>\n",
       "      <th>LTeamID</th>\n",
       "      <th>LScore</th>\n",
       "      <th>Wrank</th>\n",
       "      <th>Lrank</th>\n",
       "      <th>...</th>\n",
       "      <th>LFTM</th>\n",
       "      <th>LFTA</th>\n",
       "      <th>LOR</th>\n",
       "      <th>LDR</th>\n",
       "      <th>LAst</th>\n",
       "      <th>LTO</th>\n",
       "      <th>LStl</th>\n",
       "      <th>LBlk</th>\n",
       "      <th>LPF</th>\n",
       "      <th>poss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-08</td>\n",
       "      <td>2010-11-08_california-irvine_illinois</td>\n",
       "      <td>RG</td>\n",
       "      <td>illinois</td>\n",
       "      <td>79</td>\n",
       "      <td>california-irvine</td>\n",
       "      <td>65</td>\n",
       "      <td>13</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>14</td>\n",
       "      <td>22</td>\n",
       "      <td>11</td>\n",
       "      <td>26</td>\n",
       "      <td>12</td>\n",
       "      <td>14</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>21</td>\n",
       "      <td>74.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-08</td>\n",
       "      <td>2010-11-08_pittsburgh_rhode-island</td>\n",
       "      <td>RG</td>\n",
       "      <td>pittsburgh</td>\n",
       "      <td>83</td>\n",
       "      <td>rhode-island</td>\n",
       "      <td>75</td>\n",
       "      <td>5</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>13</td>\n",
       "      <td>16</td>\n",
       "      <td>9</td>\n",
       "      <td>20</td>\n",
       "      <td>16</td>\n",
       "      <td>16</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>25</td>\n",
       "      <td>75.62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-08</td>\n",
       "      <td>2010-11-08_maryland_seattle</td>\n",
       "      <td>RG</td>\n",
       "      <td>maryland</td>\n",
       "      <td>105</td>\n",
       "      <td>seattle</td>\n",
       "      <td>76</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>17</td>\n",
       "      <td>22</td>\n",
       "      <td>13</td>\n",
       "      <td>16</td>\n",
       "      <td>11</td>\n",
       "      <td>27</td>\n",
       "      <td>18</td>\n",
       "      <td>1</td>\n",
       "      <td>31</td>\n",
       "      <td>98.36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-08</td>\n",
       "      <td>2010-11-08_navy_texas</td>\n",
       "      <td>RG</td>\n",
       "      <td>texas</td>\n",
       "      <td>83</td>\n",
       "      <td>navy</td>\n",
       "      <td>52</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>10</td>\n",
       "      <td>18</td>\n",
       "      <td>5</td>\n",
       "      <td>21</td>\n",
       "      <td>8</td>\n",
       "      <td>16</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>24</td>\n",
       "      <td>80.44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-10</td>\n",
       "      <td>2010-11-10_college-of-charleston_maryland</td>\n",
       "      <td>RG</td>\n",
       "      <td>maryland</td>\n",
       "      <td>75</td>\n",
       "      <td>college-of-charleston</td>\n",
       "      <td>74</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>16</td>\n",
       "      <td>19</td>\n",
       "      <td>10</td>\n",
       "      <td>25</td>\n",
       "      <td>12</td>\n",
       "      <td>22</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "      <td>19</td>\n",
       "      <td>77.64</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-10</td>\n",
       "      <td>2010-11-10_illinois-chicago_pittsburgh</td>\n",
       "      <td>RG</td>\n",
       "      <td>pittsburgh</td>\n",
       "      <td>97</td>\n",
       "      <td>illinois-chicago</td>\n",
       "      <td>54</td>\n",
       "      <td>5</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "      <td>15</td>\n",
       "      <td>9</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>18</td>\n",
       "      <td>67.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-10</td>\n",
       "      <td>2010-11-10_illinois_toledo</td>\n",
       "      <td>RG</td>\n",
       "      <td>illinois</td>\n",
       "      <td>84</td>\n",
       "      <td>toledo</td>\n",
       "      <td>45</td>\n",
       "      <td>13</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>10</td>\n",
       "      <td>19</td>\n",
       "      <td>3</td>\n",
       "      <td>16</td>\n",
       "      <td>9</td>\n",
       "      <td>23</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>72.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-10</td>\n",
       "      <td>2010-11-10_louisiana-tech_texas</td>\n",
       "      <td>RG</td>\n",
       "      <td>texas</td>\n",
       "      <td>89</td>\n",
       "      <td>louisiana-tech</td>\n",
       "      <td>58</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>13</td>\n",
       "      <td>20</td>\n",
       "      <td>13</td>\n",
       "      <td>25</td>\n",
       "      <td>4</td>\n",
       "      <td>22</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>21</td>\n",
       "      <td>83.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-12</td>\n",
       "      <td>2010-11-12_maryland-eastern-shore_tulane</td>\n",
       "      <td>RG</td>\n",
       "      <td>tulane</td>\n",
       "      <td>91</td>\n",
       "      <td>maryland-eastern-shore</td>\n",
       "      <td>62</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>19</td>\n",
       "      <td>26</td>\n",
       "      <td>9</td>\n",
       "      <td>16</td>\n",
       "      <td>11</td>\n",
       "      <td>21</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>28</td>\n",
       "      <td>78.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2011</td>\n",
       "      <td>2010-11-12</td>\n",
       "      <td>2010-11-12_marquette_prairie-view</td>\n",
       "      <td>RG</td>\n",
       "      <td>marquette</td>\n",
       "      <td>97</td>\n",
       "      <td>prairie-view</td>\n",
       "      <td>58</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>...</td>\n",
       "      <td>7</td>\n",
       "      <td>11</td>\n",
       "      <td>3</td>\n",
       "      <td>13</td>\n",
       "      <td>12</td>\n",
       "      <td>20</td>\n",
       "      <td>11</td>\n",
       "      <td>3</td>\n",
       "      <td>27</td>\n",
       "      <td>79.12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 39 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Season        Date                                        gid Type  \\\n",
       "0    2011  2010-11-08      2010-11-08_california-irvine_illinois   RG   \n",
       "1    2011  2010-11-08         2010-11-08_pittsburgh_rhode-island   RG   \n",
       "2    2011  2010-11-08                2010-11-08_maryland_seattle   RG   \n",
       "3    2011  2010-11-08                      2010-11-08_navy_texas   RG   \n",
       "4    2011  2010-11-10  2010-11-10_college-of-charleston_maryland   RG   \n",
       "5    2011  2010-11-10     2010-11-10_illinois-chicago_pittsburgh   RG   \n",
       "6    2011  2010-11-10                 2010-11-10_illinois_toledo   RG   \n",
       "7    2011  2010-11-10            2010-11-10_louisiana-tech_texas   RG   \n",
       "8    2011  2010-11-12   2010-11-12_maryland-eastern-shore_tulane   RG   \n",
       "9    2011  2010-11-12          2010-11-12_marquette_prairie-view   RG   \n",
       "\n",
       "      WTeamID  WScore                 LTeamID  LScore  Wrank  Lrank  ... LFTM  \\\n",
       "0    illinois      79       california-irvine      65     13     -1  ...   14   \n",
       "1  pittsburgh      83            rhode-island      75      5     -1  ...   13   \n",
       "2    maryland     105                 seattle      76     -1     -1  ...   17   \n",
       "3       texas      83                    navy      52     -1     -1  ...   10   \n",
       "4    maryland      75   college-of-charleston      74     -1     -1  ...   16   \n",
       "5  pittsburgh      97        illinois-chicago      54      5     -1  ...    7   \n",
       "6    illinois      84                  toledo      45     13     -1  ...   10   \n",
       "7       texas      89          louisiana-tech      58     -1     -1  ...   13   \n",
       "8      tulane      91  maryland-eastern-shore      62     -1     -1  ...   19   \n",
       "9   marquette      97            prairie-view      58     -1     -1  ...    7   \n",
       "\n",
       "   LFTA  LOR  LDR  LAst  LTO  LStl  LBlk  LPF   poss  \n",
       "0    22   11   26    12   14     7     1   21  74.90  \n",
       "1    16    9   20    16   16     7     6   25  75.62  \n",
       "2    22   13   16    11   27    18     1   31  98.36  \n",
       "3    18    5   21     8   16     5     5   24  80.44  \n",
       "4    19   10   25    12   22     8     8   19  77.64  \n",
       "5     9    5   15     9   11     0     3   18  67.88  \n",
       "6    19    3   16     9   23     6     0   14  72.00  \n",
       "7    20   13   25     4   22     6     5   21  83.74  \n",
       "8    26    9   16    11   21     4     2   28  78.04  \n",
       "9    11    3   13    12   20    11     3   27  79.12  \n",
       "\n",
       "[10 rows x 39 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_gid(row):\n",
    "    tid1, tid2 = sorted((row.WTeamID, row.LTeamID))\n",
    "    return '{0}_{1}_{2}'.format(row.Date, tid1, tid2)\n",
    "\n",
    "def get_ap_ranks(row):\n",
    "    date = row.Date\n",
    "    wid, lid = row.WTeamID, row.LTeamID\n",
    "    idx = np.argmax(date < df_ap.date) - 1\n",
    "    r1, r2 = -1, -1\n",
    "    ranks = df_ap.iloc[idx].values[1:].tolist()\n",
    "    for irank,ts in enumerate(ranks):\n",
    "        if wid in ts:\n",
    "            r1 = irank+1\n",
    "        if lid in ts:\n",
    "            r2 = irank+1\n",
    "    return r1,r2\n",
    "\n",
    "dfs = []\n",
    "for fname in glob.glob(\"../data/game_data/*.csv\"):\n",
    "    dfs.append(pd.read_csv(fname))\n",
    "df = pd.concat(dfs)\n",
    "df.Date = df.Date.apply(lambda x:dt.date(*[int(x) for x in x.split('-')]))\n",
    "df = df.sort_values(\"Date\").reset_index(drop=True)\n",
    "df[\"poss\"] = 0.5*(df[\"WFGA\"] + 0.44*df[\"WFTA\"] - df[\"WOR\"] + df[\"WTO\"] + df[\"LFGA\"] + 0.44*df[\"LFTA\"] - df[\"LOR\"] + df[\"LTO\"])\n",
    "df.insert(2, \"gid\", df.apply(get_gid, axis=1))\n",
    "apranks = df.apply(get_ap_ranks, axis=1).values\n",
    "df.insert(8, \"Wrank\", [x[0] for x in apranks])\n",
    "df.insert(9, \"Lrank\", [x[1] for x in apranks])\n",
    "print(\"Shape:\",df.shape)\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 54760 entries, 0 to 54759\n",
      "Data columns (total 39 columns):\n",
      " #   Column   Non-Null Count  Dtype  \n",
      "---  ------   --------------  -----  \n",
      " 0   Season   54760 non-null  int64  \n",
      " 1   Date     54760 non-null  object \n",
      " 2   gid      54760 non-null  object \n",
      " 3   Type     54760 non-null  object \n",
      " 4   WTeamID  54760 non-null  object \n",
      " 5   WScore   54760 non-null  int64  \n",
      " 6   LTeamID  54760 non-null  object \n",
      " 7   LScore   54760 non-null  int64  \n",
      " 8   Wrank    54760 non-null  int64  \n",
      " 9   Lrank    54760 non-null  int64  \n",
      " 10  WLoc     54760 non-null  object \n",
      " 11  NumOT    54760 non-null  int64  \n",
      " 12  WFGM     54760 non-null  int64  \n",
      " 13  WFGA     54760 non-null  int64  \n",
      " 14  WFGM3    54760 non-null  int64  \n",
      " 15  WFGA3    54760 non-null  int64  \n",
      " 16  WFTM     54760 non-null  int64  \n",
      " 17  WFTA     54760 non-null  int64  \n",
      " 18  WOR      54760 non-null  int64  \n",
      " 19  WDR      54760 non-null  int64  \n",
      " 20  WAst     54760 non-null  int64  \n",
      " 21  WTO      54760 non-null  int64  \n",
      " 22  WStl     54760 non-null  int64  \n",
      " 23  WBlk     54760 non-null  int64  \n",
      " 24  WPF      54760 non-null  int64  \n",
      " 25  LFGM     54760 non-null  int64  \n",
      " 26  LFGA     54760 non-null  int64  \n",
      " 27  LFGM3    54760 non-null  int64  \n",
      " 28  LFGA3    54760 non-null  int64  \n",
      " 29  LFTM     54760 non-null  int64  \n",
      " 30  LFTA     54760 non-null  int64  \n",
      " 31  LOR      54760 non-null  int64  \n",
      " 32  LDR      54760 non-null  int64  \n",
      " 33  LAst     54760 non-null  int64  \n",
      " 34  LTO      54760 non-null  int64  \n",
      " 35  LStl     54760 non-null  int64  \n",
      " 36  LBlk     54760 non-null  int64  \n",
      " 37  LPF      54760 non-null  int64  \n",
      " 38  poss     54760 non-null  float64\n",
      "dtypes: float64(1), int64(32), object(6)\n",
      "memory usage: 16.3+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql.df_to_sql(df, 'game_data', if_exists='replace')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reddit submissions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>id</th>\n",
       "      <th>gid</th>\n",
       "      <th>is_postgame</th>\n",
       "      <th>ups</th>\n",
       "      <th>num_comments</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>157</th>\n",
       "      <td>2017-11-10</td>\n",
       "      <td>2017-11-11 08:58:11</td>\n",
       "      <td>7c718s</td>\n",
       "      <td>2017-11-10_alabama-birmingham_jacksonville</td>\n",
       "      <td>True</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>[Post Game Thread] UAB defeats Jacksonville, 9...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>2017-11-10</td>\n",
       "      <td>2017-11-11 01:40:04</td>\n",
       "      <td>7c4wus</td>\n",
       "      <td>2017-11-10_alabama_memphis</td>\n",
       "      <td>False</td>\n",
       "      <td>6</td>\n",
       "      <td>81</td>\n",
       "      <td>[Game Thread] Memphis @ Alabama (6:30 PM ET)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>2017-11-10</td>\n",
       "      <td>2017-11-11 01:55:53</td>\n",
       "      <td>7c4zzd</td>\n",
       "      <td>2017-11-10_albany-ny_iona</td>\n",
       "      <td>False</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>[Game Thread] Iona @ Albany (7:00 EST)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65</th>\n",
       "      <td>2017-11-10</td>\n",
       "      <td>2017-11-11 02:07:12</td>\n",
       "      <td>7c528t</td>\n",
       "      <td>2017-11-10_arizona_northern-arizona</td>\n",
       "      <td>False</td>\n",
       "      <td>13</td>\n",
       "      <td>82</td>\n",
       "      <td>[Game Thread] Northern Arizona @ #2 Arizona (8...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>2017-11-10</td>\n",
       "      <td>2017-11-11 04:45:38</td>\n",
       "      <td>7c5x2w</td>\n",
       "      <td>2017-11-10_austin-peay_vanderbilt</td>\n",
       "      <td>False</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>[Game Thread] Austin Peay @ Vanderbilt (8:30 P...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           date           timestamp      id  \\\n",
       "157  2017-11-10 2017-11-11 08:58:11  7c718s   \n",
       "51   2017-11-10 2017-11-11 01:40:04  7c4wus   \n",
       "56   2017-11-10 2017-11-11 01:55:53  7c4zzd   \n",
       "65   2017-11-10 2017-11-11 02:07:12  7c528t   \n",
       "108  2017-11-10 2017-11-11 04:45:38  7c5x2w   \n",
       "\n",
       "                                            gid  is_postgame  ups  \\\n",
       "157  2017-11-10_alabama-birmingham_jacksonville         True    9   \n",
       "51                   2017-11-10_alabama_memphis        False    6   \n",
       "56                    2017-11-10_albany-ny_iona        False    7   \n",
       "65          2017-11-10_arizona_northern-arizona        False   13   \n",
       "108           2017-11-10_austin-peay_vanderbilt        False    4   \n",
       "\n",
       "     num_comments                                              title  \n",
       "157             0  [Post Game Thread] UAB defeats Jacksonville, 9...  \n",
       "51             81       [Game Thread] Memphis @ Alabama (6:30 PM ET)  \n",
       "56              8             [Game Thread] Iona @ Albany (7:00 EST)  \n",
       "65             82  [Game Thread] Northern Arizona @ #2 Arizona (8...  \n",
       "108             2  [Game Thread] Austin Peay @ Vanderbilt (8:30 P...  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_pickle('../data/gamethreads/aggregated_cleaned_2017-2020.pkl.gz', compression='gzip')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 14387 entries, 157 to 19707\n",
      "Data columns (total 8 columns):\n",
      " #   Column        Non-Null Count  Dtype         \n",
      "---  ------        --------------  -----         \n",
      " 0   date          14387 non-null  object        \n",
      " 1   timestamp     14387 non-null  datetime64[ns]\n",
      " 2   id            14387 non-null  object        \n",
      " 3   gid           14387 non-null  object        \n",
      " 4   is_postgame   14387 non-null  bool          \n",
      " 5   ups           14387 non-null  int64         \n",
      " 6   num_comments  14387 non-null  int64         \n",
      " 7   title         14387 non-null  object        \n",
      "dtypes: bool(1), datetime64[ns](1), int64(2), object(4)\n",
      "memory usage: 913.2+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql.df_to_sql(df, 'gamethreads', if_exists='replace')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reddit gamethread comments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>post_id</th>\n",
       "      <th>comment_id</th>\n",
       "      <th>author</th>\n",
       "      <th>author_flair</th>\n",
       "      <th>text</th>\n",
       "      <th>is_postgame</th>\n",
       "      <th>gid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>dsh9ec</td>\n",
       "      <td>f6plj2x</td>\n",
       "      <td>CaffeinationGoat</td>\n",
       "      <td>[Connecticut Huskies, Binghamton Bearcats]</td>\n",
       "      <td>Ah well, the annual NYS private/public ivy sho...</td>\n",
       "      <td>True</td>\n",
       "      <td>2019-11-06_binghamton_cornell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>dsh9ec</td>\n",
       "      <td>f6pa84d</td>\n",
       "      <td>cheesoidhateself</td>\n",
       "      <td>[Cornell Big Red]</td>\n",
       "      <td>Consolidation of land!</td>\n",
       "      <td>True</td>\n",
       "      <td>2019-11-06_binghamton_cornell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>dsh9ec</td>\n",
       "      <td>f6qn8tq</td>\n",
       "      <td>PAPA_JOHNS_ZIMBABWE</td>\n",
       "      <td>[NJIT Highlanders]</td>\n",
       "      <td>SUNY Ithaca is bringing on the pain</td>\n",
       "      <td>True</td>\n",
       "      <td>2019-11-06_binghamton_cornell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>dsh9ec</td>\n",
       "      <td>f6pbzq4</td>\n",
       "      <td>IsYouWitItYaBish</td>\n",
       "      <td>[Wisconsin Badgers]</td>\n",
       "      <td>Wow what time did this game tip off?</td>\n",
       "      <td>True</td>\n",
       "      <td>2019-11-06_binghamton_cornell</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>dso2yc</td>\n",
       "      <td>f6qrvlq</td>\n",
       "      <td>mrfixit420</td>\n",
       "      <td>[Wake Forest Demon Deacons]</td>\n",
       "      <td>I can’t believe how dominant BC is. There are ...</td>\n",
       "      <td>False</td>\n",
       "      <td>2019-11-06_boston-college_wake-forest</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  post_id comment_id               author  \\\n",
       "0  dsh9ec    f6plj2x     CaffeinationGoat   \n",
       "1  dsh9ec    f6pa84d     cheesoidhateself   \n",
       "2  dsh9ec    f6qn8tq  PAPA_JOHNS_ZIMBABWE   \n",
       "3  dsh9ec    f6pbzq4     IsYouWitItYaBish   \n",
       "4  dso2yc    f6qrvlq           mrfixit420   \n",
       "\n",
       "                                 author_flair  \\\n",
       "0  [Connecticut Huskies, Binghamton Bearcats]   \n",
       "1                           [Cornell Big Red]   \n",
       "2                          [NJIT Highlanders]   \n",
       "3                         [Wisconsin Badgers]   \n",
       "4                 [Wake Forest Demon Deacons]   \n",
       "\n",
       "                                                text  is_postgame  \\\n",
       "0  Ah well, the annual NYS private/public ivy sho...         True   \n",
       "1                             Consolidation of land!         True   \n",
       "2                SUNY Ithaca is bringing on the pain         True   \n",
       "3               Wow what time did this game tip off?         True   \n",
       "4  I can’t believe how dominant BC is. There are ...        False   \n",
       "\n",
       "                                     gid  \n",
       "0          2019-11-06_binghamton_cornell  \n",
       "1          2019-11-06_binghamton_cornell  \n",
       "2          2019-11-06_binghamton_cornell  \n",
       "3          2019-11-06_binghamton_cornell  \n",
       "4  2019-11-06_boston-college_wake-forest  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfs = []\n",
    "for f in glob.glob('../data/gamethread_comments/*/*.pkl.gz'):\n",
    "    dfs.append(pd.read_pickle(f, compression='gzip'))\n",
    "dfc = pd.concat(dfs)\n",
    "dfc = dfc.merge(df[['id','is_postgame','gid']], left_on='post_id', right_on='id')\n",
    "dfc.drop('id', axis=1, inplace=True)\n",
    "dfc.author = dfc.author.apply(lambda x: None if x is None else x.name)\n",
    "dfc.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 387257 entries, 0 to 387256\n",
      "Data columns (total 7 columns):\n",
      " #   Column        Non-Null Count   Dtype \n",
      "---  ------        --------------   ----- \n",
      " 0   post_id       387257 non-null  object\n",
      " 1   comment_id    387257 non-null  object\n",
      " 2   author        368148 non-null  object\n",
      " 3   author_flair  387257 non-null  object\n",
      " 4   text          387257 non-null  object\n",
      " 5   is_postgame   387257 non-null  bool  \n",
      " 6   gid           387257 non-null  object\n",
      "dtypes: bool(1), object(6)\n",
      "memory usage: 21.1+ MB\n"
     ]
    }
   ],
   "source": [
    "dfc.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql.df_to_sql(dfc, 'gamethread_comments', if_exists='replace')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preseason predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>team_id</th>\n",
       "      <th>final_eff</th>\n",
       "      <th>yearm1_eff</th>\n",
       "      <th>yearm2_eff</th>\n",
       "      <th>wshares_lost</th>\n",
       "      <th>wshares_return</th>\n",
       "      <th>wshares_transfer</th>\n",
       "      <th>espn_recruit_rank</th>\n",
       "      <th>rsci_recruit_rank</th>\n",
       "      <th>rsci_recruit_points</th>\n",
       "      <th>yearm1_oeff</th>\n",
       "      <th>yearm1_deff</th>\n",
       "      <th>yearm1_pace</th>\n",
       "      <th>final_pace</th>\n",
       "      <th>recruit_score</th>\n",
       "      <th>pred_eff</th>\n",
       "      <th>pred_pace</th>\n",
       "      <th>pred_oeff</th>\n",
       "      <th>pred_deff</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2013</td>\n",
       "      <td>air-force</td>\n",
       "      <td>7.076</td>\n",
       "      <td>-0.511</td>\n",
       "      <td>5.814</td>\n",
       "      <td>1.7</td>\n",
       "      <td>12.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>98.320</td>\n",
       "      <td>98.831</td>\n",
       "      <td>62.424</td>\n",
       "      <td>64.534</td>\n",
       "      <td>0</td>\n",
       "      <td>4.862</td>\n",
       "      <td>65.030</td>\n",
       "      <td>101.000</td>\n",
       "      <td>96.137</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2013</td>\n",
       "      <td>akron</td>\n",
       "      <td>11.615</td>\n",
       "      <td>11.359</td>\n",
       "      <td>3.769</td>\n",
       "      <td>6.0</td>\n",
       "      <td>15.8</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>105.445</td>\n",
       "      <td>94.086</td>\n",
       "      <td>68.663</td>\n",
       "      <td>67.137</td>\n",
       "      <td>0</td>\n",
       "      <td>9.664</td>\n",
       "      <td>68.779</td>\n",
       "      <td>104.549</td>\n",
       "      <td>94.885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2013</td>\n",
       "      <td>alabama</td>\n",
       "      <td>13.221</td>\n",
       "      <td>18.570</td>\n",
       "      <td>15.892</td>\n",
       "      <td>7.8</td>\n",
       "      <td>14.9</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25.0</td>\n",
       "      <td>79</td>\n",
       "      <td>104.865</td>\n",
       "      <td>86.295</td>\n",
       "      <td>63.449</td>\n",
       "      <td>62.245</td>\n",
       "      <td>79</td>\n",
       "      <td>18.518</td>\n",
       "      <td>65.646</td>\n",
       "      <td>104.837</td>\n",
       "      <td>86.319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2013</td>\n",
       "      <td>alabama-am</td>\n",
       "      <td>-24.749</td>\n",
       "      <td>-24.830</td>\n",
       "      <td>-16.969</td>\n",
       "      <td>3.2</td>\n",
       "      <td>4.4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>87.082</td>\n",
       "      <td>111.912</td>\n",
       "      <td>68.426</td>\n",
       "      <td>66.737</td>\n",
       "      <td>0</td>\n",
       "      <td>-20.857</td>\n",
       "      <td>68.636</td>\n",
       "      <td>88.820</td>\n",
       "      <td>109.677</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2013</td>\n",
       "      <td>alabama-birmingham</td>\n",
       "      <td>-0.204</td>\n",
       "      <td>4.770</td>\n",
       "      <td>13.027</td>\n",
       "      <td>7.0</td>\n",
       "      <td>7.1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>98.949</td>\n",
       "      <td>94.178</td>\n",
       "      <td>62.799</td>\n",
       "      <td>71.980</td>\n",
       "      <td>0</td>\n",
       "      <td>5.359</td>\n",
       "      <td>65.256</td>\n",
       "      <td>99.250</td>\n",
       "      <td>93.891</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   year             team_id  final_eff  yearm1_eff  yearm2_eff  wshares_lost  \\\n",
       "0  2013           air-force      7.076      -0.511       5.814           1.7   \n",
       "1  2013               akron     11.615      11.359       3.769           6.0   \n",
       "2  2013             alabama     13.221      18.570      15.892           7.8   \n",
       "3  2013          alabama-am    -24.749     -24.830     -16.969           3.2   \n",
       "4  2013  alabama-birmingham     -0.204       4.770      13.027           7.0   \n",
       "\n",
       "   wshares_return  wshares_transfer  espn_recruit_rank  rsci_recruit_rank  \\\n",
       "0            12.0               0.0                NaN                NaN   \n",
       "1            15.8               0.0                NaN                NaN   \n",
       "2            14.9               0.0                NaN               25.0   \n",
       "3             4.4               0.0                NaN                NaN   \n",
       "4             7.1               0.0                NaN                NaN   \n",
       "\n",
       "   rsci_recruit_points  yearm1_oeff  yearm1_deff  yearm1_pace  final_pace  \\\n",
       "0                    0       98.320       98.831       62.424      64.534   \n",
       "1                    0      105.445       94.086       68.663      67.137   \n",
       "2                   79      104.865       86.295       63.449      62.245   \n",
       "3                    0       87.082      111.912       68.426      66.737   \n",
       "4                    0       98.949       94.178       62.799      71.980   \n",
       "\n",
       "   recruit_score  pred_eff  pred_pace  pred_oeff  pred_deff  \n",
       "0              0     4.862     65.030    101.000     96.137  \n",
       "1              0     9.664     68.779    104.549     94.885  \n",
       "2             79    18.518     65.646    104.837     86.319  \n",
       "3              0   -20.857     68.636     88.820    109.677  \n",
       "4              0     5.359     65.256     99.250     93.891  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('../data/preseason_predictions.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2792 entries, 0 to 2791\n",
      "Data columns (total 20 columns):\n",
      " #   Column               Non-Null Count  Dtype  \n",
      "---  ------               --------------  -----  \n",
      " 0   year                 2792 non-null   int64  \n",
      " 1   team_id              2792 non-null   object \n",
      " 2   final_eff            2792 non-null   float64\n",
      " 3   yearm1_eff           2792 non-null   float64\n",
      " 4   yearm2_eff           2792 non-null   float64\n",
      " 5   wshares_lost         2792 non-null   float64\n",
      " 6   wshares_return       2792 non-null   float64\n",
      " 7   wshares_transfer     2792 non-null   float64\n",
      " 8   espn_recruit_rank    288 non-null    float64\n",
      " 9   rsci_recruit_rank    404 non-null    float64\n",
      " 10  rsci_recruit_points  2792 non-null   int64  \n",
      " 11  yearm1_oeff          2792 non-null   float64\n",
      " 12  yearm1_deff          2792 non-null   float64\n",
      " 13  yearm1_pace          2792 non-null   float64\n",
      " 14  final_pace           2792 non-null   float64\n",
      " 15  recruit_score        2792 non-null   int64  \n",
      " 16  pred_eff             2792 non-null   float64\n",
      " 17  pred_pace            2792 non-null   float64\n",
      " 18  pred_oeff            2792 non-null   float64\n",
      " 19  pred_deff            2792 non-null   float64\n",
      "dtypes: float64(16), int64(3), object(1)\n",
      "memory usage: 436.4+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "sql.df_to_sql(df, 'preseason_predictions', if_exists='replace')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}