{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "tree = pd.read_csv(\"firtree.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Переименуем столбец `'Unnamed: 0'`:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "tree.rename(columns = {'Unnamed: 0':'id'}, \n", " inplace= True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Сгруппируем строки в датафрейме по столбцу `ftype`:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree.groupby(\"ftype\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Посмотрим, что внутри объекта типа `GroupBy`:" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for g in tree.groupby(\"ftype\"):\n", " print(g)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Используем перебор по названию группы и датафрейму, соответствующему этой группе и сохраним строки для каждой группы в отдельный файл `.csv.`:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for name, dat in tree.groupby(\"ftype\"):\n", " dat.to_csv(name + \".csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Применим агрегирование – посчитаем число заполненных ячеек к каждом столбце по каждой группе:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenderheightscoreexpenseswish
ftype
ель обыкновенная258258258258258257
пихта Нобилис326326326326326325
сосна Крым327327327327327327
сосна датская289289289289289289
\n", "
" ], "text/plain": [ " id gender height score expenses wish\n", "ftype \n", "ель обыкновенная 258 258 258 258 258 257\n", "пихта Нобилис 326 326 326 326 326 325\n", "сосна Крым 327 327 327 327 327 327\n", "сосна датская 289 289 289 289 289 289" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree.groupby(\"ftype\").agg('count')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "А теперь ещё и среднее:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idheightscoreexpenses
countmeancountmeancountmeancountmean
ftype
ель обыкновенная258575.895349258155.5038762582.9806202581603.813953
пихта Нобилис326623.076687326160.1012273263.0828223261634.463190
сосна Крым327596.599388327160.6574923272.9877683271572.076453
сосна датская289601.411765289159.2802772892.9584782891709.916955
\n", "
" ], "text/plain": [ " id height score \\\n", " count mean count mean count mean \n", "ftype \n", "ель обыкновенная 258 575.895349 258 155.503876 258 2.980620 \n", "пихта Нобилис 326 623.076687 326 160.101227 326 3.082822 \n", "сосна Крым 327 596.599388 327 160.657492 327 2.987768 \n", "сосна датская 289 601.411765 289 159.280277 289 2.958478 \n", "\n", " expenses \n", " count mean \n", "ftype \n", "ель обыкновенная 258 1603.813953 \n", "пихта Нобилис 326 1634.463190 \n", "сосна Крым 327 1572.076453 \n", "сосна датская 289 1709.916955 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree.groupby(\"ftype\").agg(['count', 'mean'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "И медиану заодно:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idheightscoreexpenses
countmeanmediancountmeanmediancountmeanmediancountmeanmedian
ftype
ель обыкновенная258575.895349570.5258155.503876149.02582.98062032581603.8139531580
пихта Нобилис326623.076687639.5326160.101227159.53263.08282233261634.4631901634
сосна Крым327596.599388614.0327160.657492157.03272.98776833271572.0764531552
сосна датская289601.411765557.0289159.280277159.02892.95847832891709.9169551795
\n", "
" ], "text/plain": [ " id height score \\\n", " count mean median count mean median count \n", "ftype \n", "ель обыкновенная 258 575.895349 570.5 258 155.503876 149.0 258 \n", "пихта Нобилис 326 623.076687 639.5 326 160.101227 159.5 326 \n", "сосна Крым 327 596.599388 614.0 327 160.657492 157.0 327 \n", "сосна датская 289 601.411765 557.0 289 159.280277 159.0 289 \n", "\n", " expenses \n", " mean median count mean median \n", "ftype \n", "ель обыкновенная 2.980620 3 258 1603.813953 1580 \n", "пихта Нобилис 3.082822 3 326 1634.463190 1634 \n", "сосна Крым 2.987768 3 327 1572.076453 1552 \n", "сосна датская 2.958478 3 289 1709.916955 1795 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree.groupby(\"ftype\").agg(['count', 'mean', 'median'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Теперь сообщим pandas, какую именно характеристику для какого именно столбца считать:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idheightscore
countmeanmedian
ftype
ель обыкновенная258155.5038763
пихта Нобилис326160.1012273
сосна Крым327160.6574923
сосна датская289159.2802773
\n", "
" ], "text/plain": [ " id height score\n", " count mean median\n", "ftype \n", "ель обыкновенная 258 155.503876 3\n", "пихта Нобилис 326 160.101227 3\n", "сосна Крым 327 160.657492 3\n", "сосна датская 289 159.280277 3" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res = tree.groupby(\"ftype\").agg({'id' : ['count'],\n", " 'height' : ['mean'], \n", " 'score' : ['median']})\n", "res" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Сохранили результат в переменную `res`. Посмотрим на её структуру: выведем названия столбцов и строк:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([( 'id', 'count'),\n", " ('height', 'mean'),\n", " ( 'score', 'median')],\n", " )" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.columns" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['ель обыкновенная', 'пихта Нобилис', 'сосна Крым', 'сосна датская'], dtype='object', name='ftype')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.index" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mean
ftype
ель обыкновенная155.503876
пихта Нобилис160.101227
сосна Крым160.657492
сосна датская159.280277
\n", "
" ], "text/plain": [ " mean\n", "ftype \n", "ель обыкновенная 155.503876\n", "пихта Нобилис 160.101227\n", "сосна Крым 160.657492\n", "сосна датская 159.280277" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res[\"height\"]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ftype\n", "ель обыкновенная 155.503876\n", "пихта Нобилис 160.101227\n", "сосна Крым 160.657492\n", "сосна датская 159.280277\n", "Name: mean, dtype: float64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res[\"height\"][\"mean\"]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id count 258.000000\n", "height mean 155.503876\n", "score median 3.000000\n", "Name: ель обыкновенная, dtype: float64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.loc['ель обыкновенная', :]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "res2 = tree.groupby([\"gender\", \"wish\"]).agg({'expenses': \n", " ['count', 'mean', 'median']})" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
expenses
countmeanmedian
genderwish
femaleда3071710.7817591754
нет3041567.0526321519
maleда3041659.3453951680
нет2831571.9151941512
\n", "
" ], "text/plain": [ " expenses \n", " count mean median\n", "gender wish \n", "female да 307 1710.781759 1754\n", " нет 304 1567.052632 1519\n", "male да 304 1659.345395 1680\n", " нет 283 1571.915194 1512" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('expenses', 'count'),\n", " ('expenses', 'mean'),\n", " ('expenses', 'median')],\n", " )" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2.columns" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('female', 'да'),\n", " ('female', 'нет'),\n", " ( 'male', 'да'),\n", " ( 'male', 'нет')],\n", " names=['gender', 'wish'])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2.index" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
expenses
countmeanmedian
gender
female3071710.7817591754
male3041659.3453951680
\n", "
" ], "text/plain": [ " expenses \n", " count mean median\n", "gender \n", "female 307 1710.781759 1754\n", "male 304 1659.345395 1680" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2.xs('да', level = 'wish')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
expenses
countmeanmedian
wish
да3071710.7817591754
нет3041567.0526321519
\n", "
" ], "text/plain": [ " expenses \n", " count mean median\n", "wish \n", "да 307 1710.781759 1754\n", "нет 304 1567.052632 1519" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2.xs('female', level = 'gender')" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "307" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2.xs('female', level = 'gender')['expenses'].iloc[0,0]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "307" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2.xs('female', level = 'gender')['expenses'].loc[\"да\", 'count']" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }