{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Exploration du Dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "filePath = \"../Datasets/advertising.csv\"\n", "df = pd.read_csv(filePath)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(200, 5)" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TVRadioNewspaperSales
0230.137.869.222.1
144.539.345.110.4
217.245.969.39.3
3151.541.358.518.5
4180.810.858.412.9
\n", "
" ], "text/plain": [ " TV Radio Newspaper Sales\n", "0 230.1 37.8 69.2 22.1\n", "1 44.5 39.3 45.1 10.4\n", "2 17.2 45.9 69.3 9.3\n", "3 151.5 41.3 58.5 18.5\n", "4 180.8 10.8 58.4 12.9" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.subplot(2, 2, 1)\n", "df['TV'].hist(bins = 100)\n", "plt.subplot(2, 2, 2)\n", "df['Newspaper'].hist(bins = 100)\n", "plt.subplot(2, 2, 3)\n", "df['Radio'].hist(bins = 100)\n", "plt.subplot(2, 2, 4)\n", "df['Sales'].hist(bins = 100)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On remarque que les variables 'Radio' et 'TV' sont distribuée d'une façon uniforme, tandis que la probabilité de la variable 'Newspaper' décroit d'une façon exponentielle. La variable 'Sales' semble avoir une distribution gaussienne." ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAET5JREFUeJzt3W+MXFd9xvHvUxPaqhTw4iWNHKeLVAs5itRCVzRS/IKQUiW0qvOiabEqGiFLfhMQqEhtWr8ISLVE31BKhGitOopTUYeoUGEh+idKjSJLgHAooqHbKi4KZJUIm8YlVIhC6K8v9rpd27PZGe/evTPH3480mrln7uz8Nj5+fHPuveekqpAktevHhi5AktQvg16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUuJcNXQDAjh07amFhYegy1Kgnnnji21U1P8R327fVp3H79lQE/cLCAqdPnx66DDUqyTeG+m77tvo0bt926EbS4I4fP85NN93Etm3buOmmmzh+/PjQJTVlKo7oJV29jh8/zqFDhzh69Ch79+7l1KlTHDhwAID9+/cPXF0bPKKXNKjDhw9z9OhRbr31Vq655hpuvfVWjh49yuHDh4curRke0U+xJBN/xmmnNWuWlpbYu3fvRW179+5laWlpoIra4xH9FKuqkY/13pNmyZ49ezh16tRFbadOnWLPnj0DVdQeg17SoA4dOsSBAwc4efIkP/zhDzl58iQHDhzg0KFDQ5fWDIduJA3qwgnXd7/73SwtLbFnzx4OHz7sidhNZNBLGtz+/fsN9h45dCNJjTPoJalxBr2uWkkeSHI2yZOr2uaSPJrkqe55e9eeJB9JcibJV5O8cbjK2+Odsf0y6HU1exC4/ZK2e4HHqmo38Fi3DXAHsLt7HAQ+tkU1Nu/CnbH3338/3//+97n//vs5dOiQYb+JDHpdtarqceD5S5r3Ace618eAO1e1P1QrvgC8Osl1W1Np27wztn8GvXSxa6vqOYDu+bVd+07gmVX7LXdt2iDvjO2fQS+NZ9R8FCNvRU5yMMnpJKfPnTvXc1mzzztj+2fQSxf71oUhme75bNe+DOxatd/1wLOjfkBVHamqxapanJ8fZL2TmeKdsf3zhinpYieAu4EPds+fXtX+riQPA78EfOfCEI82xjtj+2fQ66qV5DjwZmBHkmXgPlYC/pEkB4BvAnd1u38WeBtwBvge8M4tL7hh3hnbL4NeV62qWitZbhuxbwH39F
uR1A/H6CWpcQa9JDXOoJekxjlGL2nLTbpMpqunbYxBL2nLrRXcSQz1Hlzx0E2SXUlOJllK8rUk7+naR87+J0kaxkbG6F8E3ldVe4CbgXuS3Mjas/9JkgZwxUFfVc9V1Ze7198FlliZ5Gmt2f8kSQPYlKtukiwAbwC+yNqz/0mSBrDhoE/yCuCTwHur6oUJPucMf5K0BTYU9EmuYSXkP15Vn+qa15r97yLO8CdJW2MjV90EOAosVdWHVr11YfY/uHj2P0nSADZyHf0twDuAf07yla7tD1l79j9J0gCuOOir6hSjV92BEbP/SZKG4Vw3ktQ4g16SGmfQS1LjDHpJapxBPyXm5uZIMtYDGHvfJMzNzQ3820kaktMUT4nz58/3Nj3rpHN/S2qLR/SS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjnOtGGiHJ08B3gR8BL1bVYpI54BPAAvA08JtVdX6oGqVxeUQvre3WqvqFqlrstu8FHquq3cBj3bY09Qx6aXz7gGPd62PAnQPWIo3NoZspUfe9Et7/qv5+tiZVwD8kKeDPq+oIcG1VPQdQVc8lee2oDyY5CBwEuOGGG7aqXmlNBv2UyAde6HU++np/Lz+6ZbdU1bNdmD+a5F/H/WD3j8IRgMXFxX7+UKUJOHQjjVBVz3bPZ4G/Ad4EfCvJdQDd89nhKpTGZ9BLl0jyU0l++sJr4FeAJ4ETwN3dbncDnx6mQmkyDt1Il7sW+JtuCcaXAX9VVX+X5EvAI0kOAN8E7hqwxpkwNzfH+fOTXYE67tKX27dv5/nnn7+Ssq46Br10iar6OvDzI9r/A7ht6yuaXa6FPB02NHST5IEkZ5M8uaptLsmjSZ7qnrdvvExJ0pXa6Bj9g8Dtl7R5U4kkTZENBX1VPQ5cOkjmTSWSNEX6GKMf66YSXa6vMcft2x09k65mg52M9e7Bi01ywipJbye4JLWnj+vox7qppKqOVNViVS3Oz8/3UIYkCfoJem8qkaQpstHLK48Dnwden2S5u5Hkg8BbkzwFvLXbliQNZENj9FW1f423vKlEkqaEc91IUuMMeklqnEEvSY0z6CWpcQa9JDXOaYol9ca1kKeDQS+pN66FPB0cupGkxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zhumJPXKRe+HZ9BL6s2kd8W68H0/HLqRpMYZ9JLUOINekhpn0EtS4wx6aQJJbk/yb0nOJLl36HqkcRj00piSbAM+CtwB3AjsT3LjsFVJ6zPopfG9CThTVV+vqh8ADwP7Bq5JWpdBL41vJ/DMqu3lrk2aat4wNcVe6o7Ctd7zZpNejfqPftl/8CQHgYMAN9xwQ981zaRJ+7b9emM8op9iVTXxQ71aBnat2r4eePbSnarqSFUtVtXi/Pz8lhU3S+zXW8ugl8b3JWB3ktcleTnwduDEwDVJ63LoRhpTVb2Y5F3A3wPbgAeq6msDlyWtK9Pwv0VJzgHfGLqOGbID+PbQRcyQn62qQcZQ7NsTs29PZqy+PRVBr8kkOV1Vi0PXIW02+3Y/HKOXpMYZ9JLUOIN+Nh0ZugCpJ/btHjhGL0mN84hekhpn0EtS4wz6GZLkgSRnkzw5dC3SZrJv98ugny0PArcPXYTUgwexb/fGoJ8hVfU48PzQdUibzb7dL4Nekhpn0EtS4wx6SWqcQS9JjTPoZ0iS48DngdcnWU5yYOiapM1g3+6XUyBIUuM8opekxhn0ktQ4g16SGrfu4uBJfgJ4HPjxbv+/rqr7krwOeBiYA74MvKOqfpDkx4GHgF8E/gP4rap6+qW+Y8eOHbWwsLCR30Na0xNPPPHtodaMtW+rT+P27XWDHvhv4C1V9V9JrgFOJflb4HeBP6mqh5P8GXAA+Fj3fL6qfi7J24E/Bn7rpb5gYWGB06dPj1GKNLkkgy3Obd9Wn8bt2+sO3dSK/+o2r+keBbwF+Ouu/RhwZ/
d6X7dN9/5tSTJm3ZKkTTbWGH2SbUm+ApwFHgX+HfjPqnqx22UZ2Nm93gk8A9C9/x3gNSN+5sEkp5OcPnfu3MZ+C0nSmsYK+qr6UVX9AnA98CZgz6jduudRR++XXaxfVUeqarGqFufnBxk+nXpJJn5Is8B+vbXGGaP/P1X1n0k+B9wMvDrJy7qj9uuBZ7vdloFdwHKSlwGvwulHr8haN7MlWfM9aRbYt7fWukf0SeaTvLp7/ZPALwNLwEngN7rd7gY+3b0+0W3Tvf+P5Z+cJA1mnCP664BjSbax8g/DI1X1mST/Ajyc5I+AfwKOdvsfBf4yyRlWjuTf3kPdkqQxrRv0VfVV4A0j2r/Oynj9pe3fB+7alOokSRvmnbGS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g15aQzc99z8l+Uy3/bokX0zyVJJPJHn50DVK4zDopbW9h5UJ/C74Y1ZWVdsNnGdlNTVp6hn00ghJrgd+FfiLbjusvaqaNNUMemm0DwO/B/xPt/0a1l5VTZpqBr10iSS/BpytqidWN4/YdeQ6Cy6TqWlj0EuXuwX49SRPAw+zMmTzYbpV1bp9Vq+qdhGXydS0MeilS1TVH1TV9VW1wMrCOf9YVb/N2quqSVPNoJfG9/vA73arp72G/19VTZpqEy0OLl1tqupzwOe61yNXVZOmnUf0ktQ4g35KzM3NkWSsBzD2vkmYm5sb+LfT1WqSfj1p37Zfj8+hmylx/vx5qkZerbdhF/4CSVvNfj0d1j2iT7IryckkS0m+luQ9Xftckke7eT8eTbK9a0+SjyQ5k+SrSd7Y9y8hSVrbOEM3LwLvq6o9wM3APUluBO4FHuvm/Xis2wa4A9jdPQ4CH9v0qiVJY1s36Kvquar6cvf6u6xM8rQT2MfKfB9w8bwf+4CHasUXWLnJ5LpNr1ySNJaJTsYmWQDeAHwRuLaqnoOVfwyA13a77QSeWfUx5wSRpAGNHfRJXgF8EnhvVb3wUruOaLvsbIzzgUjS1hgr6JNcw0rIf7yqPtU1f+vCkEz3fLZrXwZ2rfr4yDlBnA9EkrbGOFfdhJVbvZeq6kOr3jrBynwfcPG8HyeA3+muvrkZ+M6FIR5J0tYb5zr6W4B3AP+c5Ctd2x8CHwQeSXIA+CZwV/feZ4G3AWeA7wHv3NSKJUkTWTfoq+oUo8fdAW4bsX8B92ywLknSJnEKBElqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNc83YKVH3vRLe/6r+frakq5ZBPyXygRd6XUS53t/Lj5Y0Axy6kaTGGfSS1DiHbiT1xnNP08Ggly6RZBfwEPAzwP8AR6rqT5PMAZ8AFoCngd+sqvND1TkLPPc0HRy6kS73IvC+qtoD3Azck+RG4F7gsaraDTzWbUtTz6CXLlFVz1XVl7vX3wWWgJ3APuBYt9sx4M5hKpQmY9BLLyHJAvAG4IvAtRcWuu+eXztcZdL4DHppDUleAXwSeG9VvTDB5w4mOZ3k9Llz5/orUBqTQS+NkOQaVkL+41X1qa75W0mu696/Djg76rNVdaSqFqtqcX5+fmsKll7CukGf5IEkZ5M8uaptLsmjSZ7qnrd37UnykSRnknw1yRv7LF7qQ5IAR4GlqvrQqrdOAHd3r+8GPr3VtUlXYpwj+geB2y9pW+vqgzuA3d3jIPCxzSlT2lK3AO8A3pLkK93jbcAHgbcmeQp4a7ctTb11r6Ovqse7E1Kr7QPe3L0+BnwO+P2u/aFauXD2C0leneS6CyewpFlQVaeArPH2bVtZi7QZrnSMfq2rD3YCz6zab7lrkyQNZLNPxo46Chp5W5xXJkjS1rjSoF/r6oNlYNeq/a4Hnh31A7wyQZK2xpUG/VpXH5wAfqe7+uZm4DuOz0vSsNY9GZvkOCsnXnckWQbuY+Vqg0eSHAC+CdzV7f5Z4G3AGeB7wDt7qFmSNIFxrrrZv8
Zbl1190F1tc89Gi5IkbR7vjJWkxhn0ktQ4g16SGucKU1NkZYqVzbd9+/Zefq6k2WDQT4lJlltL0tvybJLa49CNJDXOoJekxjl0I6lXnnsankEvqTeTnkvy/FM/HLqRpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqXC9Bn+T2JP+W5EySe/v4DmkI9m3Nok0P+iTbgI8CdwA3AvuT3LjZ3yNtNfu2ZlUfR/RvAs5U1der6gfAw8C+Hr5H2mr2bc2kPoJ+J/DMqu3lrk0TSjLysd576s1YfTvJwSSnk5w+d+7clhU3Sybt29qYPoJ+1J/KZUvG+JdhfVU18UO9GqtvV9WRqlqsqsX5+fktKGv22K+3Vh9BvwzsWrV9PfDspTv5l0EzaKy+LU2bPoL+S8DuJK9L8nLg7cCJHr5H2mr2bc2kTV8cvKpeTPIu4O+BbcADVfW1zf4eaavZtzWrMg3jX0nOAd8Yuo4ZsgP49tBFzJCfrapBxgft2xOzb09mrL49FUGvySQ5XVWLQ9chbTb7dj+cAkGSGmfQS1LjDPrZdGToAqSe2Ld74Bi9JDXOI3pJapxBP0OSPJDkbJInh65F2kz27X4Z9LPlQeD2oYuQevAg9u3eGPQzpKoeB54fug5ps9m3+2XQS1LjDHpJapxBL0mNM+glqXEG/QxJchz4PPD6JMtJDgxdk7QZ7Nv98s5YSWqcR/SS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxv0vLlr8RD7wJsUAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.subplot(2, 2, 1)\n", "plt.boxplot(df['Sales'])\n", "plt.subplot(2, 2, 2)\n", "plt.boxplot(df['Newspaper'])\n", "plt.subplot(2, 2, 3)\n", "plt.boxplot(df['TV'])\n", "plt.subplot(2, 2, 4)\n", "plt.boxplot(df['Radio'])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On remarque qu'il y a deux valeurs aberrantes dans la colonne 'Newspaper'\n", "Il n'ya pas de valeurs manquantes dans le dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Corrélations entre les variables" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "import numpy as np\n", "correlations = df.corr()\n", "\n", "# Generate a mask for the upper triangle\n", "mask = np.zeros_like(correlations, dtype=np.bool)\n", "mask[np.triu_indices_from(mask)] = True\n", "\n", "# Set up the matplotlib figure\n", "f, ax = plt.subplots(figsize=(5, 4))\n", " \n", "# Generate a custom diverging colormap\n", "cmap = sns.diverging_palette(220, 10, as_cmap=True)\n", "\n", "# Draw the heatmap with the mask and correct aspect ratio\n", "sns.heatmap(correlations, mask=mask, cmap=cmap, vmax=.3, center=0,\n", " square=True, linewidths=.5, cbar_kws={\"shrink\": .5})\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On remarque qu'il y a une corrélation entre va variable 'Newspaper' et 'Radio' de 0.3. on remarque aussi que la variable 'Newspaper' est la moins correlée avec la variable 'Sales'" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.subplot(2, 3, 1)\n", "plt.plot(df['Newspaper'], df['Sales'], 'o', color='red');\n", "plt.xlabel('Newspaper')\n", "plt.ylabel('Sales')\n", "\n", "plt.subplot(2, 3, 2)\n", "plt.plot(df['TV'], df['Sales'], 'o', color='blue');\n", "plt.xlabel('TV')\n", "plt.ylabel('Sales')\n", "\n", "plt.subplot(2, 3, 3)\n", "plt.plot(df['Radio'], df['Sales'], 'o', color='brown');\n", "plt.xlabel('Radio')\n", "plt.ylabel('Sales')\n", "\n", "plt.subplot(2, 3, 4)\n", "plt.plot(df['Radio'], df['Newspaper'], 'o', color='black');\n", "plt.xlabel('Radio')\n", "plt.ylabel('Newspaper')\n", "\n", "plt.subplot(2, 3, 5)\n", "plt.plot(df['TV'], df['Newspaper'], 'o', color='magenta');\n", "plt.xlabel('TV')\n", "plt.ylabel('Newspaper')\n", "\n", "plt.subplot(2, 3, 6)\n", "plt.plot(df['TV'], df['Radio'], 'o', color='green');\n", "plt.xlabel('TV')\n", "plt.ylabel('Radio')\n", "\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On observe qu'il y a une corrélation entre la variables 'Sales' et les deux variables 'Radio' et 'TV'\n", "On observe aussi qu'il y a une relation presque quadratique entre la variable 'Sales' et 'TV'. 
Il serait donc peut-être préférable d'ajouter un coefficient pour la variable 'TV'^2 au modèle de régression linéaire" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modèles univariables" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import statsmodels.formula.api as smf\n", "lmTV = smf.ols(formula='Sales ~ TV ', data=df).fit()\n", "lmRadio = smf.ols(formula='Sales ~ Radio ', data=df).fit()\n", "lmNewspaper = smf.ols(formula='Sales ~ Newspaper ', data=df).fit()" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Sales R-squared: 0.612\n", "Model: OLS Adj. R-squared: 0.610\n", "Method: Least Squares F-statistic: 312.1\n", "Date: Wed, 26 Sep 2018 Prob (F-statistic): 1.47e-42\n", "Time: 15:32:09 Log-Likelihood: -519.05\n", "No. Observations: 200 AIC: 1042.\n", "Df Residuals: 198 BIC: 1049.\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 7.0326 0.458 15.360 0.000 6.130 7.935\n", "TV 0.0475 0.003 17.668 0.000 0.042 0.053\n", "==============================================================================\n", "Omnibus: 0.531 Durbin-Watson: 1.935\n", "Prob(Omnibus): 0.767 Jarque-Bera (JB): 0.669\n", "Skew: -0.089 Prob(JB): 0.716\n", "Kurtosis: 2.779 Cond. No. 
338.\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "print(lmTV.summary())" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Sales R-squared: 0.332\n", "Model: OLS Adj. R-squared: 0.329\n", "Method: Least Squares F-statistic: 98.42\n", "Date: Wed, 26 Sep 2018 Prob (F-statistic): 4.35e-19\n", "Time: 15:32:14 Log-Likelihood: -573.34\n", "No. Observations: 200 AIC: 1151.\n", "Df Residuals: 198 BIC: 1157.\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 9.3116 0.563 16.542 0.000 8.202 10.422\n", "Radio 0.2025 0.020 9.921 0.000 0.162 0.243\n", "==============================================================================\n", "Omnibus: 19.358 Durbin-Watson: 1.946\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 21.910\n", "Skew: -0.764 Prob(JB): 1.75e-05\n", "Kurtosis: 3.544 Cond. No. 51.4\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "print(lmRadio.summary())" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Sales R-squared: 0.052\n", "Model: OLS Adj. 
R-squared: 0.047\n", "Method: Least Squares F-statistic: 10.89\n", "Date: Wed, 26 Sep 2018 Prob (F-statistic): 0.00115\n", "Time: 15:32:16 Log-Likelihood: -608.34\n", "No. Observations: 200 AIC: 1221.\n", "Df Residuals: 198 BIC: 1227.\n", "Df Model: 1 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 12.3514 0.621 19.876 0.000 11.126 13.577\n", "Newspaper 0.0547 0.017 3.300 0.001 0.022 0.087\n", "==============================================================================\n", "Omnibus: 6.231 Durbin-Watson: 1.983\n", "Prob(Omnibus): 0.044 Jarque-Bera (JB): 5.483\n", "Skew: 0.330 Prob(JB): 0.0645\n", "Kurtosis: 2.527 Cond. No. 64.7\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "print(lmNewspaper.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- D'après les valeurs de R^2, des pvalues et de la MSE, on remarque que la relation entre la variable 'Newspaper' et la variable 'Sales' est très faible (R^2 = 0.052), bien que sa p-value (0.001) reste statistiquement significative. \n", "- Le meilleur modèle est celui de la variable 'Sales' en fonction de la variable 'TV'. 
Ainsi, TV est plus corrélée à Sales que Radio parce que les publicités audiovisuelles sont plus captivantes que les publicités à la radio.\n", "- Pour que le coefficient de la régression linéaire reflète l'importance de la variable par rapport aux autres, on doit normaliser les variables\n", "- Dans le modèle Sales ~ TV, l'intercept représente le nombre d'unités de Sales si le budget TV est nul, qui se situe entre 6 et 8 (intervalle de confiance à 95 %)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modèle multi-variables" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Sales R-squared: 0.897\n", "Model: OLS Adj. R-squared: 0.896\n", "Method: Least Squares F-statistic: 570.3\n", "Date: Wed, 26 Sep 2018 Prob (F-statistic): 1.58e-96\n", "Time: 15:36:14 Log-Likelihood: -386.18\n", "No. Observations: 200 AIC: 780.4\n", "Df Residuals: 196 BIC: 793.6\n", "Df Model: 3 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 2.9389 0.312 9.422 0.000 2.324 3.554\n", "Radio 0.1885 0.009 21.893 0.000 0.172 0.206\n", "TV 0.0458 0.001 32.809 0.000 0.043 0.049\n", "Newspaper -0.0010 0.006 -0.177 0.860 -0.013 0.011\n", "==============================================================================\n", "Omnibus: 60.414 Durbin-Watson: 2.084\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 151.241\n", "Skew: -1.327 Prob(JB): 1.44e-33\n", "Kurtosis: 6.332 Cond. No. 
454.\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "lmMultiVariables = smf.ols(formula='Sales ~ Radio + TV + Newspaper ', data=df).fit()\n", "print(lmMultiVariables.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- La première remarque à propos de ce modèle est que la pvalue de la variable 'Newspaper' est très grande (0.86) avec un coefficient qui est très petit, alors ce n'est pas la peine de laisser cette variable dans le modèle. On observe aussi que la valeur de R-squared est très élevée (0.897), ce qui montre que notre modèle performe bien\n", "- Si on augmente de 50 les sommes allouées au média TV, les ventes augmentent d'environ 2.29 (50 × 0.0458)\n", "- Le coefficient pour Newspaper est presque nul, légèrement négatif, dans le modèle complet tandis qu'il est positif lorsque pris en compte individuellement parce que, dans le premier modèle, il n'y avait pas d'autres coefficients significatifs, à part celui de l'intercept, qui affectaient les résultats. Donc je pense que le fait que le coefficient soit négatif est juste dû au hasard, c'est-à-dire qu'un simple changement dans les valeurs de la base de données ou dans les coefficients des autres variables peut donner un coefficient positif" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Sales R-squared: 0.897\n", "Model: OLS Adj. R-squared: 0.896\n", "Method: Least Squares F-statistic: 859.6\n", "Date: Wed, 26 Sep 2018 Prob (F-statistic): 4.83e-98\n", "Time: 15:40:54 Log-Likelihood: -386.20\n", "No. 
Observations: 200 AIC: 778.4\n", "Df Residuals: 197 BIC: 788.3\n", "Df Model: 2 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 2.9211 0.294 9.919 0.000 2.340 3.502\n", "Radio 0.1880 0.008 23.382 0.000 0.172 0.204\n", "TV 0.0458 0.001 32.909 0.000 0.043 0.048\n", "==============================================================================\n", "Omnibus: 60.022 Durbin-Watson: 2.081\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 148.679\n", "Skew: -1.323 Prob(JB): 5.19e-33\n", "Kurtosis: 6.292 Cond. No. 425.\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "lmMultiVariables = smf.ols(formula='Sales ~ Radio + TV', data=df).fit()\n", "print(lmMultiVariables.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On remarque que les valeurs de R-squared, Adj. R-squared et MSE n'ont pas changé après avoir enlevé la variable 'Newspaper', mais par contre le modèle est plus simple à interpréter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Modèle multiplicatif :" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Sales R-squared: 0.968\n", "Model: OLS Adj. R-squared: 0.967\n", "Method: Least Squares F-statistic: 1963.\n", "Date: Wed, 26 Sep 2018 Prob (F-statistic): 6.68e-146\n", "Time: 15:53:44 Log-Likelihood: -270.14\n", "No. 
Observations: 200 AIC: 548.3\n", "Df Residuals: 196 BIC: 561.5\n", "Df Model: 3 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 6.7502 0.248 27.233 0.000 6.261 7.239\n", "Radio 0.0289 0.009 3.241 0.001 0.011 0.046\n", "TV 0.0191 0.002 12.699 0.000 0.016 0.022\n", "TV:Radio 0.0011 5.24e-05 20.727 0.000 0.001 0.001\n", "==============================================================================\n", "Omnibus: 128.132 Durbin-Watson: 2.224\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 1183.719\n", "Skew: -2.323 Prob(JB): 9.09e-258\n", "Kurtosis: 13.975 Cond. No. 1.80e+04\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 1.8e+04. This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n" ] } ], "source": [ "lmMultiVariables = smf.ols(formula='Sales ~ Radio + TV + TV*Radio', data=df).fit()\n", "print(lmMultiVariables.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On remarque que l'ajout de la variable multiplicative TV * Radio a produit un modèle plus performant que l'ancien. On remarque aussi que le 'std err' du coefficient de cette nouvelle variable est très petit, ce qui montre la forte influence de cette variable sur le modèle. 
On peut interpréter cela par le fait que, pour une publicité efficace, on doit intégrer la TV et la Radio en même temps, et ne pas négliger l'une par rapport à l'autre" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: Sales R-squared: 0.968\n", "Model: OLS Adj. R-squared: 0.967\n", "Method: Least Squares F-statistic: 1963.\n", "Date: Wed, 26 Sep 2018 Prob (F-statistic): 6.68e-146\n", "Time: 15:54:16 Log-Likelihood: -270.14\n", "No. Observations: 200 AIC: 548.3\n", "Df Residuals: 196 BIC: 561.5\n", "Df Model: 3 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "Intercept 6.7502 0.248 27.233 0.000 6.261 7.239\n", "Radio 0.0289 0.009 3.241 0.001 0.011 0.046\n", "TV 0.0191 0.002 12.699 0.000 0.016 0.022\n", "TV:Radio 0.0011 5.24e-05 20.727 0.000 0.001 0.001\n", "==============================================================================\n", "Omnibus: 128.132 Durbin-Watson: 2.224\n", "Prob(Omnibus): 0.000 Jarque-Bera (JB): 1183.719\n", "Skew: -2.323 Prob(JB): 9.09e-258\n", "Kurtosis: 13.975 Cond. No. 1.80e+04\n", "==============================================================================\n", "\n", "Warnings:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", "[2] The condition number is large, 1.8e+04. 
This might indicate that there are\n", "strong multicollinearity or other numerical problems.\n" ] } ], "source": [ "lmMultiVariables = smf.ols(formula='Sales ~ Radio + TV + TV*Radio + TV*TV ', data=df).fit()\n", "print(lmMultiVariables.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On ne remarque pas une amélioration dans la précision du modèle avec l'ajout de la variable 'TV'^2, donc ce n'est pas la peine d'ajouter cette variable au modèle et de le rendre plus compliqué.\n", "\n", "**Remarque :** dans une formule patsy, `TV*TV` se développe en `TV + TV + TV:TV`, ce qui se réduit à `TV` ; aucun terme quadratique n'est donc réellement ajouté, et c'est pourquoi le résumé ci-dessus est identique à celui du modèle précédent. Pour tester réellement l'effet de 'TV'^2, il faudrait écrire `I(TV**2)` dans la formule." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }