{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Эксперименты по ускорению обработки данных в Python\n", "\n", "* 2019, Александр Дьяконов (https://dyakonov.org/ag/)\n", "\n", "Частично использованы материалы\n", "\n", "* Глеба Маслякова https://nbviewer.jupyter.org/github/glebmaslyak/PZAD_Homeworks/blob/student/PZAD_feature_preprocessing_hw.ipynb \n", "\n", "* Дениса Бибика https://nbviewer.jupyter.org/github/den-bibik/PZAD/blob/master/optimize.ipynb" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Задача 1 - устранить знак доллара" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricefeature
017\$0.0
189\$0.0
239\$0.0
397\$0.0
423\$0.0
\n", "
" ], "text/plain": [ " price feature\n", "0 17\$ 0.0\n", "1 89\$ 0.0\n", "2 39\$ 0.0\n", "3 97\$ 0.0\n", "4 23\$ 0.0" ] }, "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def make_s(n_rows):\n", " tmp = pd.DataFrame({'price': (100*np.random.rand(n_rows)).astype(int), 'feature': np.zeros(n_rows)})\n", " tmp['price'] = tmp['price'].astype(str) + '\$'\n", " return tmp\n", "\n", "data = make_s(5)\n", "data" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricefeatureprice(\$)_v1price(\$)_v2price(\$)_v3price(\$)_v4
017\$0.017171717
189\$0.089898989
239\$0.039393939
397\$0.097979797
423\$0.023232323
\n", "
" ], "text/plain": [ " price feature price(\$)_v1 price(\$)_v2 price(\$)_v3 price(\$)_v4\n", "0 17\$ 0.0 17 17 17 17\n", "1 89\$ 0.0 89 89 89 89\n", "2 39\$ 0.0 39 39 39 39\n", "3 97\$ 0.0 97 97 97 97\n", "4 23\$ 0.0 23 23 23 23" ] }, "execution_count": 139, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['price(\$)_v1'] = data['price'].apply(lambda x: int(x[:-1]))\n", "data['price(\$)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)\n", "data['price(\$)_v3'] = data['price'].apply(lambda x: x.replace('\$', '')).astype(int)\n", "data['price(\$)_v4'] = data['price'].str.replace('\$', '').astype(int)\n", "data" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [], "source": [ "data = make_s(10000000)" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4.26 s, sys: 60 ms, total: 4.32 s\n", "Wall time: 4.36 s\n" ] } ], "source": [ "%%time\n", "data['price(\$))_v1'] = data['price'].apply(lambda x: int(x[:-1]))\n", "# 4.2-4.33" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.5 s, sys: 124 ms, total: 2.62 s\n", "Wall time: 2.65 s\n" ] } ], "source": [ "%%time\n", "data['price(\$)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)\n", "# 2.47 s - 2.52 s" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.19 s, sys: 152 ms, total: 3.34 s\n", "Wall time: 3.35 s\n" ] } ], "source": [ "%%time\n", "data['price(\$)_v3'] = data['price'].apply(lambda x: x.replace('\$', '')).astype(int) \n", "# 3.14 - 3.31" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.43 s, sys: 164 ms, total: 3.6 s\n", "Wall time: 3.63 s\n" ] } ], "source": [ "%%time\n", "data['price(\$)_v4'] = data['price'].str.replace('\$', '').astype(int)\n", "# 3.43 - 4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Задача - бинаризовать" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typefeature
0A0.0
1B0.0
2A0.0
3A0.0
4A0.0
\n", "
" ], "text/plain": [ " type feature\n", "0 A 0.0\n", "1 B 0.0\n", "2 A 0.0\n", "3 A 0.0\n", "4 A 0.0" ] }, "execution_count": 150, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def make_t(n_rows):\n", " tmp = pd.DataFrame({'type': np.where(np.random.rand(n_rows)<0.5, 'A', 'B'), 'feature': np.zeros(n_rows)})\n", " return tmp\n", "\n", "data = make_t(5)\n", "data" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typefeaturetype_v1type_v2type_v3type_v4type_v5type_v6type_v7
0A0.01111010
1B0.00000101
2A0.01111010
3A0.01111010
4A0.01111010
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featureA/B
00.023/60
10.065/76
20.066/53
30.057/53
40.085/18
\n", "
" ], "text/plain": [ " feature A/B\n", "0 0.0 23/60\n", "1 0.0 65/76\n", "2 0.0 66/53\n", "3 0.0 57/53\n", "4 0.0 85/18" ] }, "execution_count": 169, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def make_ab(n_rows):\n", " tmp = pd.DataFrame({'A': (100*np.random.rand(n_rows)).astype(int), 'B': (100*np.random.rand(n_rows)).astype(int), 'feature': np.zeros(n_rows)})\n", " tmp['A/B'] = tmp['A'].astype(str) + '/' + tmp['B'].astype(str)\n", " del tmp['A']\n", " del tmp['B']\n", " return tmp\n", "\n", "data = make_ab(5)\n", "data" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featureA/BA_v1B_v1A_v2B_v2A_v3B_v3A_v4B_v4
00.023/602360236023602360
10.065/766576657665766576
20.066/536653665366536653
30.057/535753575357535753
40.085/188518851885188518
\n", "
" ], "text/plain": [ " feature A/B A_v1 B_v1 A_v2 B_v2 A_v3 B_v3 A_v4 B_v4\n", "0 0.0 23/60 23 60 23 60 23 60 23 60\n", "1 0.0 65/76 65 76 65 76 65 76 65 76\n", "2 0.0 66/53 66 53 66 53 66 53 66 53\n", "3 0.0 57/53 57 53 57 53 57 53 57 53\n", "4 0.0 85/18 85 18 85 18 85 18 85 18" ] }, "execution_count": 171, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tmp = data['A/B'].str.split('/')\n", "data['A_v1'] = tmp.apply(lambda x: x[0])\n", "data['B_v1'] = tmp.apply(lambda x: x[1])\n", "\n", "data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', 1).tolist())\n", "\n", "data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)\n", "\n", "st = '/'.join(data['A/B'])\n", "data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))\n", "\n", "data" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "data = make_ab(10000000)" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 13.1 s, sys: 1.51 s, total: 14.6 s\n", "Wall time: 14.6 s\n" ] } ], "source": [ "%%time\n", "tmp = data['A/B'].str.split('/')\n", "data['A_v1'] = tmp.apply(lambda x: x[0])\n", "data['B_v1'] = tmp.apply(lambda x: x[1])\n", "# 12.5 s-13.4 s" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 9.94 s, sys: 176 ms, total: 10.1 s\n", "Wall time: 10.2 s\n" ] } ], "source": [ "%%time\n", "data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', 1).tolist())\n", "# 10.2 s - 12.2 s" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 26.1 s, sys: 368 ms, total: 26.5 s\n", "Wall time: 26.5 s\n" ] } ], "source": [ "%%time\n", "data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)\n", "# 26.1 s - 29.2 s" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.65 s, sys: 168 ms, total: 3.82 s\n", "Wall time: 3.84 s\n" ] } ], "source": [ "%%time\n", "st = '/'.join(data['A/B'])\n", "data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))\n", "# 3.65 s - 4.54 s" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Задача - заменить пропуски средним" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typefeaturefeature_v1feature_v2feature_v3feature_v4
0testNaNNaNNaNNaNNaN
1test43.043.043.043.043.0
2testNaNNaNNaNNaNNaN
3train4.04.04.04.04.0
4train18.018.018.018.018.0
5trainNaNNaNNaNNaNNaN
6trainNaNNaNNaNNaNNaN
7train25.025.025.025.025.0
8trainNaNNaNNaNNaNNaN
9trainNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " type feature feature_v1 feature_v2 feature_v3 feature_v4\n", "0 test NaN NaN NaN NaN NaN\n", "1 test 43.0 43.0 43.0 43.0 43.0\n", "2 test NaN NaN NaN NaN NaN\n", "3 train 4.0 4.0 4.0 4.0 4.0\n", "4 train 18.0 18.0 18.0 18.0 18.0\n", "5 train NaN NaN NaN NaN NaN\n", "6 train NaN NaN NaN NaN NaN\n", "7 train 25.0 25.0 25.0 25.0 25.0\n", "8 train NaN NaN NaN NaN NaN\n", "9 train NaN NaN NaN NaN NaN" ] }, "execution_count": 178, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def make_t(n_rows):\n", " tmp = pd.DataFrame({'type': np.where(np.random.rand(n_rows)<0.5, 'train', 'test'),\n", " 'feature': np.where(np.random.rand(n_rows)<0.5, (100*np.random.rand(n_rows)).astype(int), np.nan)})\n", " tmp['feature_v1'] = tmp['feature']\n", " tmp['feature_v2'] = tmp['feature']\n", " tmp['feature_v3'] = tmp['feature']\n", " tmp['feature_v4'] = tmp['feature']\n", " return tmp\n", "\n", "data = make_t(10)\n", "data" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typefeaturefeature_v1feature_v2feature_v3feature_v4
0testNaN43.00000043.00000043.00000043.000000
1test43.043.00000043.00000043.00000043.000000
2testNaN43.00000043.00000043.00000043.000000
3train4.04.0000004.0000004.0000004.000000
4train18.018.00000018.00000018.00000018.000000
5trainNaN15.66666715.66666715.66666715.666667
6trainNaN15.66666715.66666715.66666715.666667
7train25.025.00000025.00000025.00000025.000000
8trainNaN15.66666715.66666715.66666715.666667
9trainNaN15.66666715.66666715.66666715.666667
\n", "
" ], "text/plain": [ " type feature feature_v1 feature_v2 feature_v3 feature_v4\n", "0 test NaN 43.000000 43.000000 43.000000 43.000000\n", "1 test 43.0 43.000000 43.000000 43.000000 43.000000\n", "2 test NaN 43.000000 43.000000 43.000000 43.000000\n", "3 train 4.0 4.000000 4.000000 4.000000 4.000000\n", "4 train 18.0 18.000000 18.000000 18.000000 18.000000\n", "5 train NaN 15.666667 15.666667 15.666667 15.666667\n", "6 train NaN 15.666667 15.666667 15.666667 15.666667\n", "7 train 25.0 25.000000 25.000000 25.000000 25.000000\n", "8 train NaN 15.666667 15.666667 15.666667 15.666667\n", "9 train NaN 15.666667 15.666667 15.666667 15.666667" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "name = 'feature_v1'\n", "data.loc[data['type'] == 'test', name] = \\\n", " data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())\n", "data.loc[data['type'] == 'train', name] = \\\n", " data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())\n", "\n", "name = 'feature_v2'\n", "data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))\n", "\n", "name = 'feature_v3'\n", "data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')\n", "\n", "name = 'feature_v4'\n", "data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])\n", "\n", "#name = 'feature_v4'\n", "#gb = data.groupby('type')\n", "#mn = gb.mean()\n", "#for gn, x in gb:\n", "# x[name].fillna(mn.loc[gn], inplace=True)\n", "\n", "data" ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [], "source": [ "data = make_t(10000000)" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.66 s, sys: 132 ms, total: 3.79 s\n", "Wall time: 3.81 s\n" ] } ], "source": [ "%%time\n", "\n", "name = 'feature_v1'\n", "data.loc[data['type'] == 'test', name] = data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())\n", "data.loc[data['type'] == 'train', name] = data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())\n", "\n", "# 3.44 s - 3.84 s" ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.9 s, sys: 152 ms, total: 2.05 s\n", "Wall time: 2.06 s\n" ] } ], "source": [ "%%time\n", "name = 'feature_v2'\n", "data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))\n", "# 1.9 s - 2.04 s" ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.2 s, sys: 128 ms, total: 1.32 s\n", "Wall time: 1.35 s\n" ] } ], "source": [ "%%time\n", "name = 'feature_v3'\n", "data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')\n", "# 1.17 - 1.18 s" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.37 s, sys: 72 ms, total: 1.44 s\n", "Wall time: 1.45 s\n" ] } ], "source": [ "%%time\n", "name = 'feature_v4'\n", "data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])\n", "# 1.26 s - 1.38" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }