{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Семинар 5: проверка статистических гипотез" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Загрузим таблицу с оценками студентов по разным курсам из csv-файла по ссылке (ФИО студентов зашифрованы):" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"http://math-info.hse.ru/f/2017-18/py-prog/scores2.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Посмотрим на нее:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Удалим строки с пропущенными значениями:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = df.dropna()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Выведем общую информацию по датафрейму:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 54 entries, 0 to 57\n", "Data columns (total 18 columns):\n", "id 54 non-null object\n", "catps 54 non-null int64\n", "mstat 54 non-null int64\n", "soc 54 non-null int64\n", "econ 54 non-null int64\n", "eng 54 non-null int64\n", "polth 54 non-null int64\n", "mstat2 54 non-null int64\n", "phist 54 non-null float64\n", "law 54 non-null int64\n", "phil 54 non-null int64\n", "polsoc 54 non-null int64\n", "ptheo 54 non-null float64\n", "preg 54 non-null int64\n", "compp 54 non-null float64\n", "game 54 non-null int64\n", "wpol 54 non-null int64\n", "male 54 non-null int64\n", "dtypes: float64(3), int64(14), object(1)\n", "memory usage: 8.0+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Выведем описательные статистики по 
всем количественным показателям в датафрейме:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
catpsmstatsoceconengpolthmstat2phistlawphilpolsocptheopregcomppgamewpolmale
count54.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.00000054.000000
mean6.8148157.5925937.2407416.2592598.3888896.7222227.1481485.8333336.9629636.0740747.3703705.6851856.8148155.6481486.3703707.7222220.425926
std1.3882811.4986601.1148261.7176680.9598871.6301021.6868801.6106951.1649431.8617491.4314041.4118641.3328091.3894131.8045681.3516820.499126
min5.0000005.0000005.0000004.0000006.0000004.0000004.0000004.0000005.0000004.0000004.0000004.0000004.0000004.0000004.0000004.0000000.000000
25%6.0000006.2500006.0000005.0000008.0000006.0000006.0000004.0000006.0000005.0000006.0000005.0000006.0000005.0000005.0000007.0000000.000000
50%7.0000007.5000007.0000006.0000008.5000006.0000007.0000006.0000007.0000006.0000008.0000005.0000007.0000005.0000006.0000008.0000000.000000
75%7.7500009.0000008.0000007.0000009.0000008.0000008.7500007.0000008.0000007.0000008.0000006.7500008.0000006.7500008.0000009.0000001.000000
max10.00000010.00000010.00000010.00000010.00000010.00000010.0000009.0000009.00000010.00000010.0000009.0000008.0000008.00000010.00000010.0000001.000000
\n", "
" ], "text/plain": [ " catps mstat soc econ eng polth \\\n", "count 54.000000 54.000000 54.000000 54.000000 54.000000 54.000000 \n", "mean 6.814815 7.592593 7.240741 6.259259 8.388889 6.722222 \n", "std 1.388281 1.498660 1.114826 1.717668 0.959887 1.630102 \n", "min 5.000000 5.000000 5.000000 4.000000 6.000000 4.000000 \n", "25% 6.000000 6.250000 6.000000 5.000000 8.000000 6.000000 \n", "50% 7.000000 7.500000 7.000000 6.000000 8.500000 6.000000 \n", "75% 7.750000 9.000000 8.000000 7.000000 9.000000 8.000000 \n", "max 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 \n", "\n", " mstat2 phist law phil polsoc ptheo \\\n", "count 54.000000 54.000000 54.000000 54.000000 54.000000 54.000000 \n", "mean 7.148148 5.833333 6.962963 6.074074 7.370370 5.685185 \n", "std 1.686880 1.610695 1.164943 1.861749 1.431404 1.411864 \n", "min 4.000000 4.000000 5.000000 4.000000 4.000000 4.000000 \n", "25% 6.000000 4.000000 6.000000 5.000000 6.000000 5.000000 \n", "50% 7.000000 6.000000 7.000000 6.000000 8.000000 5.000000 \n", "75% 8.750000 7.000000 8.000000 7.000000 8.000000 6.750000 \n", "max 10.000000 9.000000 9.000000 10.000000 10.000000 9.000000 \n", "\n", " preg compp game wpol male \n", "count 54.000000 54.000000 54.000000 54.000000 54.000000 \n", "mean 6.814815 5.648148 6.370370 7.722222 0.425926 \n", "std 1.332809 1.389413 1.804568 1.351682 0.499126 \n", "min 4.000000 4.000000 4.000000 4.000000 0.000000 \n", "25% 6.000000 5.000000 5.000000 7.000000 0.000000 \n", "50% 7.000000 5.000000 6.000000 8.000000 0.000000 \n", "75% 8.000000 6.750000 8.000000 9.000000 1.000000 \n", "max 8.000000 8.000000 10.000000 10.000000 1.000000 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Попроверяем гипотезы на основе имеющихся данных – воспользуемся библиотекой `scipy`." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from scipy import stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Гипотеза о равенстве среднего числу** " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Проверим гипотезу о равенстве средней оценки студентов по экономике 6 на 5%-ном уровне значимости.\n", "\n", "$H_0: \\mu_{econ} = 6$. \n", "\n", "Альтернатива в функциях `scipy.stats` по умолчанию двусторонняя, то есть:\n", "\n", "$H_1: \\mu_{econ} \\ne 6$." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Давайте исходить из предположения, что выборка оценок по экономике взята из нормального распределения. Строго говоря, это не так, потому что шкала оценок – порядковая, и оценки принимают всего 10 значений (по факту, даже меньше, потому что оценки взяты из рейтинга после пересдач и оценок ниже 4 в базе нет). Давайте посмотрим на гистограмму для оценок по экономике: импортируем модуль `pyplot` из библиотеки для графики `matplotlib` и скажем Python выводить графики прямо в файле `.ipynb`, а не в отдельном окне (`%matplotlib inline`):" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYIAAAD8CAYAAAB6paOMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAEPdJREFUeJzt3XusZXV5xvHvI6MRLBaV46XA6QGDqCVe8GisVFQQg6KgpraQ2lBvY1rrrW10qG3xnybYUpXGpjpFBG8YxRsVL4xaJU0UHS4WEC1WRxxBZyi1eEf07R97Tx3GGWadPWetNef8vp/kZK+19tr7fVc28PBb11QVkqR23WXsBiRJ4zIIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY1bM3YDXRx44IG1sLAwdhuStKJcfvnlN1fV3O7WWxFBsLCwwMaNG8duQ5JWlCTf7LKeu4YkqXEGgSQ1ziCQpMYZBJLUOINAkhrXWxAkOTfJliTX7LD8pUm+muTaJH/XV31JUjd9jgjOA07YfkGSJwEnAw+rqt8CzuqxviSpg96CoKouBW7ZYfEfA2dW1U+n62zpq74kqZuhjxE8CHh8ksuSfDbJoweuL0nawdBXFq8B7gU8Fng08N4kh1VV7bhikrXAWoD5+flBm9RsFtZdPFrtTWeeOFptaaUbekSwGfhATXwB+AVw4M5WrKr1VbVYVYtzc7u9VYYkaUZDB8GHgGMBkjwIuBtw88A9SJK209uuoSQXAE8EDkyyGTgDOBc4d3pK6W3AaTvbLSRJGk5vQVBVp+7iref2VVOStHReWSxJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmN6y0IkpybZMv0sZQ7vvcXSSrJTh9cL0kaTp8jgvOAE3ZcmOQQ4Hjghh5rS5I66i0IqupS4JadvPUG4FWAD62XpL3AoMcIkpwEfLuqvjRkXUnSrq0ZqlCS/YDXAE/puP5aYC3A/Px8j51JUtuGHBE8EDgU+FKSTcDBwBVJ7r+zlatqfVUtVtXi3NzcgG1KUlsGGxFU1dXAfbfNT8NgsapuHqoHSdKv6vP00QuAzwFHJNmc5AV91ZIkza63EUFVnbqb9xf6qi1J6s4riyWpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxfT6q8twkW5Jcs92yv0/ylST/keSDSQ7oq74kqZs+RwTnASfssGwDcGRVPQz4T+D0HutLkjroLQiq6lLglh2WXVJVt09nPw8c3Fd9SVI3Yx4jeD7wsV29mWRtko1JNm7dunXAtiSpLaMEQZLXALcD79rVOlW1vqoWq2pxbm5uuOYkqTFrhi6Y5DTg6cBxVVVD15ck3dGgQZDkBODVwBOq6kdD1pYk7Vyfp49eAHwOOCLJ5iQvAN4E7A9sSHJVkjf3VV+S1E1vI4KqOnUni9/aVz1J0my8sliSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYNfouJoS2su3i02pvOPHG02pLUlSMCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqXKcgSHLkUr84yblJtiS5Zrtl906yIcn109d7LfV7JUnLq+uI4M1JvpDkT5Ic0PEz5wEn7LBsHfCpqjoc+NR0XpI0ok5BUFW/A/wBcAiwMcm7kxy/m89cCtyyw+KTgfOn0+cDz1xau5Kk5db5GEFVXQ/8FfBq4AnAPyb5SpJnL6He/arqpun33QTcd1crJlmbZGOSjVu3bl1CCUnSUnQ9RvCwJG8ArgOOBZ5RVQ+ZTr+hj8aqan1VLVbV4tzcXB8lJEl0HxG8CbgCeHhVvaS
qrgCoqhuZjBK6+m6SBwBMX7cspVlJ0vLrGgRPA95dVT8GSHKXJPsBVNU7llDvIuC06fRpwIeX8FlJUg+6BsEngX23m99vumyXklwAfA44IsnmJC8AzgSOT3I9cPx0XpI0oq5PKLt7Vf1g20xV/WDbiGBXqurUXbx1XNfmJEn96zoi+GGSo7bNJHkU8ON+WpIkDanriOAVwPuS3DidfwDw+/20JEkaUqcgqKovJnkwcAQQ4CtV9bNeO5MkDaLriADg0cDC9DOPTEJVvb2XriRJg+kUBEneATwQuAr4+XRxAQaBJK1wXUcEi8BDq6r6bEaSNLyuZw1dA9y/z0YkSePoOiI4EPhyki8AP922sKpO6qUraYVYWHfxaLU3nXniaLW1unQNgtf22YQkaTxdTx/9bJLfBA6vqk9Oryrep9/WJElD6Hob6hcBFwJvmS46CPhQX01JkobT9WDxS4CjgVvh/x9Ss8uHykiSVo6uQfDTqrpt20ySNUyuI5AkrXBdg+CzSf4S2Hf6rOL3Af/aX1uSpKF0DYJ1wFbgauDFwEdZ2pPJJEl7qa5nDf0C+JfpnyRpFel6r6FvsJNjAlV12LJ3JEka1FLuNbTN3YHnAPeetWiSVwIvZBIuVwPPq6qfzPp9kqTZdTpGUFX/vd3ft6vqjcCxsxRMchDwMmCxqo5kcmHaKbN8lyRpz3XdNXTUdrN3YTJC2H8P6+6b5GfAfsCNu1lfktSTrruG/mG76duBTcDvzVKwqr6d5CzgBibPPb6kqi7Zcb0ka4G1APPz87OUkiR10PWsoSctV8Ek9wJOBg4FvsfkWcjPrap37lBzPbAeYHFx0YvXJKknXXcN/dmdvV9Vr19CzScD36iqrdPv/gDwOOCdd/opSVIvlnLW0KOBi6bzzwAuBb41Q80bgMdO72D6Y+A4YOMM3yNJWgZLeTDNUVX1fYAkrwXeV1UvXGrBqrosyYXAFUyON1zJdBeQJGl4XYNgHrhtu/nbgIVZi1bVGcAZs35ekrR8ugbBO4AvJPkgk4vAngW8vbeuJEmD6XrW0N8m+Rjw+Omi51XVlf21JUkaSte7j8Lkwq9bq+psYHOSQ3vqSZI0oK6PqjwDeDVw+nTRXfF0T0laFbqOCJ4FnAT8EKCqbmTPbjEhSdpLdA2C26qqmN6KOsk9+mtJkjSkrkHw3iRvAQ5I8iLgk/iQGklaFbqeNXTW9FnFtwJHAH9TVRt67UzSnVpYd/EodTedeeIoddWf3QZBkn2AT1TVkwH/4y9Jq8xudw1V1c+BHyX59QH6kSQNrOuVxT8Brk6ygemZQwBV9bJeupIkDaZrEFw8/ZMkrTJ3GgRJ5qvqhqo6f6iGJEnD2t0xgg9tm0jy/p57kSSNYHdBkO2mD+uzEUnSOHYXBLWLaUnSKrG7g8UPT3Irk5HBvtNppvNVVfecpWiSA4BzgCOZBMzzq+pzs3yXJGnP3GkQVNU+PdU9G/h4Vf1ukrsxucW1JGkEXU8fXTZJ7gkcA/wRQFXdxh0fgylJGtBSHkyzXA4DtgJvS3JlknO8m6kkjWfwEcG05lHAS6vqsiRnA+uAv95+pSRrgbUA8/Pzgze5HLwpmKSVYIwRwWZgc1VdNp2/kEkw3EFVra+qxapanJubG7RBSWrJ4EFQVd8BvpXkiOmi44AvD92HJGlijF1DAC8F3jU9Y+jrwPNG6kOSmjdKEFTVVcDiGLUlSXc0xjECSdJexCCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDVutCBIsk+SK5N8ZKweJEnjjgheDlw3Yn1JEiMFQZKDgROBc8aoL0n6pbFGBG8EXgX8YqT6kqSpNUMXTPJ0YEtVXZ7kiXey3lpgLcD8/PxA3UnSr1pYd/FotTedeWLvNcYYERwNnJRkE/Ae4Ngk79xxpapaX1WLVbU4Nzc3dI+S1IzBg6C
qTq+qg6tqATgF+HRVPXfoPiRJE15HIEmNG/wYwfaq6jPAZ8bsQZJa54hAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGjd4ECQ5JMm/JbkuybVJXj50D5KkXxrjUZW3A39eVVck2R+4PMmGqvryCL1IUvMGHxFU1U1VdcV0+vvAdcBBQ/chSZoY9RhBkgXgkcBlY/YhSS0bLQiS/BrwfuAVVXXrTt5fm2Rjko1bt24dvkFJasQoQZDkrkxC4F1V9YGdrVNV66tqsaoW5+bmhm1QkhoyxllDAd4KXFdVrx+6viTpjsYYERwN/CFwbJKrpn9PG6EPSRIjnD5aVf8OZOi6kqSd88piSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuPGeB6BJM1kYd3FY7ewKjkikKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDVurIfXn5Dkq0m+lmTdGD1IkibGeHj9PsA/AU8FHgqcmuShQ/chSZoYY0TwGOBrVfX1qroNeA9w8gh9SJIYJwgOAr613fzm6TJJ0gjGuOlcdrKsfmWlZC2wdjr7gyRfnbHegcDNM352b9NpW/K6ATrZc8v6u4y4zf7ztXdaNb9LXrdH2/KbXVYaIwg2A4dsN38wcOOOK1XVemD9nhZLsrGqFvf0e/YGbsveZ7VsB7gte6shtmWMXUNfBA5PcmiSuwGnABeN0IckiRFGBFV1e5I/BT4B7AOcW1XXDt2HJGlilAfTVNVHgY8OVG6Pdy/tRdyWvc9q2Q5wW/ZWvW9Lqn7lOK0kqSHeYkKSGreqgyDJPkmuTPKRsXvZU0k2Jbk6yVVJNo7dz6ySHJDkwiRfSXJdkt8eu6dZJDli+lts+7s1ySvG7mtWSV6Z5Nok1yS5IMndx+5pFklePt2Ga1fa75Hk3CRbklyz3bJ7J9mQ5Prp6736qL2qgwB4OXDd2E0soydV1SNW+GlxZwMfr6oHAw9nhf4+VfXV6W/xCOBRwI+AD47c1kySHAS8DFisqiOZnMRxyrhdLV2SI4EXMbl7wcOBpyc5fNyuluQ84IQdlq0DPlVVhwOfms4vu1UbBEkOBk4Ezhm7F00kuSdwDPBWgKq6raq+N25Xy+I44L+q6ptjN7IH1gD7JlkD7MdOru1ZAR4CfL6qflRVtwOfBZ41ck+dVdWlwC07LD4ZOH86fT7wzD5qr9ogAN4IvAr4xdiNLJMCLkly+fSq65XoMGAr8LbpLrtzktxj7KaWwSnABWM3Mauq+jZwFnADcBPwv1V1ybhdzeQa4Jgk90myH/A07njx6kp0v6q6CWD6et8+iqzKIEjydGBLVV0+di/L6OiqOorJXVtfkuSYsRuawRrgKOCfq+qRwA/paag7lOlFkScB7xu7l1lN9zufDBwK/AZwjyTPHberpauq64DXARuAjwNfAm4ftakVYlUGAXA0cFKSTUzubnpskneO29Keqaobp69bmOyLfsy4Hc1kM7C5qi6bzl/IJBhWsqcCV1TVd8duZA88GfhGVW2tqp8BHwAeN3JPM6mqt1bVUVV1DJPdLNeP3dMe+m6SBwBMX7f0UWRVBkFVnV5VB1fVApNh+6erasX9H842Se6RZP9t08BTmAyDV5Sq+g7wrSRHTBcdB3x5xJaWw6ms4N1CUzcAj02yX5Iw+V1W5EH8JPedvs4Dz2bl/zYXAadNp08DPtxHkVGuLNaS3Q/44OTfUdYA766qj4/b0sxeCrxrukvl68DzRu5nZtP90McDLx67lz1RVZcluRC4gsmulCtZuVfmvj/JfYCfAS+pqv8Zu6GuklwAPBE4MMlm4AzgTOC9SV7AJLCf00ttryyWpLatyl1DkqTuDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhr3f+83O/Gre49eAAAAAEl
FTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# kind='hist' draws a histogram; kind must be passed by keyword (Series.plot rejects positional arguments in pandas >= 1.0)\n", "df['econ'].plot(kind='hist')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Если мы все же предположим, что распределение нормальное (просто выборка маленькая и по ней незаметно), мы сможем использовать критерий Стьюдента для одной выборки (*1-sample t-test*):" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Ttest_1sampResult(statistic=1.1091540214492386, pvalue=0.272371318024516)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.ttest_1samp(df['econ'], 6) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Уровень значимости мы приняли равным 5%, значит, $\\alpha = 0.05$. \n", "\n", "p-value > $\\alpha$, следовательно, на имеющихся данных на 5% уровне значимости нет оснований отвергнуть нулевую гипотезу в пользу альтернативы (это статистический вывод). Среднюю оценку студентов по экономике можно считать равной 6 (это содержательный вывод)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Если бы мы проверяли гипотезу о равенстве средней оценки по экономике 5, ситуация была бы обратной, и нулевую гипотезу следовало бы отвергнуть:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Ttest_1sampResult(statistic=5.38731953275344, pvalue=1.674626984992714e-06)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.ttest_1samp(df['econ'], 5) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Значит, разница всего в один балл является существенной для наших данных (несложно угадать, почему, если вспомнить, что 5 баллов – это «удовлетворительно», а 6 баллов – уже «хорошо»)." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Гипотеза о равенстве средних**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Сравним средние оценки юношей и девушек, сгруппировав строки по показателю `male`:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
catpsmstatsoceconengpolthmstat2phistlawphilpolsocptheopregcomppgamewpol
male
06.4193557.3548397.1612906.0000008.3870976.2580657.0645165.4516136.8387105.7419357.0645165.4193557.1935485.5483876.3225817.612903
17.3478267.9130437.3478266.6086968.3913047.3478267.2608706.3478267.1304356.5217397.7826096.0434786.3043485.7826096.4347837.869565
\n", "
" ], "text/plain": [ " catps mstat soc econ eng polth mstat2 \\\n", "male \n", "0 6.419355 7.354839 7.161290 6.000000 8.387097 6.258065 7.064516 \n", "1 7.347826 7.913043 7.347826 6.608696 8.391304 7.347826 7.260870 \n", "\n", " phist law phil polsoc ptheo preg compp \\\n", "male \n", "0 5.451613 6.838710 5.741935 7.064516 5.419355 7.193548 5.548387 \n", "1 6.347826 7.130435 6.521739 7.782609 6.043478 6.304348 5.782609 \n", "\n", " game wpol \n", "male \n", "0 6.322581 7.612903 \n", "1 6.434783 7.869565 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "groups = df.groupby('male')\n", "# numeric_only=True skips the non-numeric `id` column (pandas >= 2.0 raises TypeError otherwise)\n", "groups.mean(numeric_only=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Сохраним оценки по теории игр (`game`), сгруппированные по полу, в переменные `male_game` и `female_game`." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "male_game = df[df['male'] == 1]['game']\n", "female_game = df[df['male'] == 0]['game']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Проверим на 10%-ном уровне значимости гипотезу о равенстве средних оценок по теории игр у юношей и девушек (опять же в предположении о том, что обе выборки взяты из нормального распределения):\n", "\n", "$H_0: \\mu_{male} = \\mu_{female}$\n", "\n", "$H_1: \\mu_{male} \\ne \\mu_{female}$ (альтернатива по умолчанию двусторонняя)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Ttest_indResult(statistic=-0.22389694844949543, pvalue=0.8237148426143586)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ind: the two samples are independent\n", "stats.ttest_ind(female_game, male_game)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "На имеющихся данных, на уровне значимости 10% (и на меньшем) нет оснований отвергнуть нулевую гипотезу. 
Средние оценки по теории игр у юношей и девушек можно считать равными." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Проделаем то же с оценками по политической истории (`phist`):" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "male_phist = df[df['male'] == 1]['phist']\n", "female_phist = df[df['male'] == 0]['phist']" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Ttest_indResult(statistic=-2.0846778601989686, pvalue=0.04202658789268412)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.ttest_ind(female_phist, male_phist)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Тут ситуация уже менее однозначная – все зависит от уровня значимости (подумайте, при каких уровнях значимости нулевая гипотеза будет отвергаться, а при каких – нет)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Точно так же можем сравнить оценки по разным курсам (в разных столбцах, например, по разным частям «Математики и статистики»). Строго говоря, это оценки одних и тех же студентов, то есть выборки зависимые (парные), и для них корректнее использовать критерий Стьюдента для связанных выборок (`stats.ttest_rel`); `ttest_ind` здесь применяется только для иллюстрации." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Ttest_indResult(statistic=1.4474034225404007, pvalue=0.15073504510027344)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.ttest_ind(df.mstat, df.mstat2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Гипотеза о равенстве нулю коэффициента корреляции**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Немного визуализации" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from pandas import plotting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Матрица диаграмм рассеяния для выбранных столбцов (как *scattermatrix* в R):" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: 'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.\n", " This is separate from the ipykernel package so we can avoid doing imports until\n" ] }, { "data": { "text/plain": [ "array([[,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ],\n", " [,\n", " ,\n", " ,\n", " ]],\n", " dtype=object)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "\n", "plotting.scatter_matrix(df[['mstat', 'mstat2', 'econ', 'eng']]) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Коэффициенты корреляции" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Оценим связь между оценками за курс «Математика и статистика 1» и «Математика и статистика 2».\n", "\n", "$H_0: \\rho = 0$ (связи нет, коэффициент корреляции не является статистически значимым)\n", "\n", "$H_1: \\rho \\ne 0$ (связь есть, коэффициент корреляции является статистически значимым)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(0.7557387472251619, 3.961005655367343e-11)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.pearsonr(df.mstat, df.mstat2) # Pearson correlation coefficient" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Интерпретация: связь прямая (положительная) и сильная, нулевая гипотеза о незначимости коэффициента корреляции отвергается (на любом конвенциональном уровне значимости), есть значимая связь между оценками по двум частям курса «Математика и статистика»." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SpearmanrResult(correlation=0.7631510956897434, pvalue=1.965305087090471e-11)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats.spearmanr(df.mstat, df.mstat2) # Spearman rank correlation (more appropriate here: the grading scale is ordinal)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }