{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Эксперименты по ускорению обработки данных в Python\n",
"\n",
"* 2019, Александр Дьяконов (https://dyakonov.org/ag/)\n",
"\n",
"Частично использованы материалы\n",
"\n",
"* Глеба Маслякова https://nbviewer.jupyter.org/github/glebmaslyak/PZAD_Homeworks/blob/student/PZAD_feature_preprocessing_hw.ipynb \n",
"\n",
"* Дениса Бибика https://nbviewer.jupyter.org/github/den-bibik/PZAD/blob/master/optimize.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Задача 1 - устранить знак доллара"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" feature | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 17$ | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 89$ | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 39$ | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 97$ | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 23$ | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" price feature\n",
"0 17$ 0.0\n",
"1 89$ 0.0\n",
"2 39$ 0.0\n",
"3 97$ 0.0\n",
"4 23$ 0.0"
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def make_s(n_rows):\n",
" tmp = pd.DataFrame({'price': (100*np.random.rand(n_rows)).astype(int), 'feature': np.zeros(n_rows)})\n",
" tmp['price'] = tmp['price'].astype(str) + '$'\n",
" return tmp\n",
"\n",
"data = make_s(5)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" price | \n",
" feature | \n",
" price($)_v1 | \n",
" price($)_v2 | \n",
" price($)_v3 | \n",
" price($)_v4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 17$ | \n",
" 0.0 | \n",
" 17 | \n",
" 17 | \n",
" 17 | \n",
" 17 | \n",
"
\n",
" \n",
" 1 | \n",
" 89$ | \n",
" 0.0 | \n",
" 89 | \n",
" 89 | \n",
" 89 | \n",
" 89 | \n",
"
\n",
" \n",
" 2 | \n",
" 39$ | \n",
" 0.0 | \n",
" 39 | \n",
" 39 | \n",
" 39 | \n",
" 39 | \n",
"
\n",
" \n",
" 3 | \n",
" 97$ | \n",
" 0.0 | \n",
" 97 | \n",
" 97 | \n",
" 97 | \n",
" 97 | \n",
"
\n",
" \n",
" 4 | \n",
" 23$ | \n",
" 0.0 | \n",
" 23 | \n",
" 23 | \n",
" 23 | \n",
" 23 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" price feature price($)_v1 price($)_v2 price($)_v3 price($)_v4\n",
"0 17$ 0.0 17 17 17 17\n",
"1 89$ 0.0 89 89 89 89\n",
"2 39$ 0.0 39 39 39 39\n",
"3 97$ 0.0 97 97 97 97\n",
"4 23$ 0.0 23 23 23 23"
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['price($)_v1'] = data['price'].apply(lambda x: int(x[:-1]))\n",
"data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)\n",
"data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int)\n",
"data['price($)_v4'] = data['price'].str.replace('$', '').astype(int)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"data = make_s(10000000)"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.26 s, sys: 60 ms, total: 4.32 s\n",
"Wall time: 4.36 s\n"
]
}
],
"source": [
"%%time\n",
"data['price($))_v1'] = data['price'].apply(lambda x: int(x[:-1]))\n",
"# 4.2-4.33"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.5 s, sys: 124 ms, total: 2.62 s\n",
"Wall time: 2.65 s\n"
]
}
],
"source": [
"%%time\n",
"data['price($)_v2'] = data['price'].apply(lambda x: x[:-1]).astype(int)\n",
"# 2.47 s - 2.52 s"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.19 s, sys: 152 ms, total: 3.34 s\n",
"Wall time: 3.35 s\n"
]
}
],
"source": [
"%%time\n",
"data['price($)_v3'] = data['price'].apply(lambda x: x.replace('$', '')).astype(int) \n",
"# 3.14 - 3.31"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.43 s, sys: 164 ms, total: 3.6 s\n",
"Wall time: 3.63 s\n"
]
}
],
"source": [
"%%time\n",
"data['price($)_v4'] = data['price'].str.replace('$', '').astype(int)\n",
"# 3.43 - 4"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Задача - бинаризовать"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" type | \n",
" feature | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" A | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" A | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" type feature\n",
"0 A 0.0\n",
"1 B 0.0\n",
"2 A 0.0\n",
"3 A 0.0\n",
"4 A 0.0"
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def make_t(n_rows):\n",
" tmp = pd.DataFrame({'type': np.where(np.random.rand(n_rows)<0.5, 'A', 'B'), 'feature': np.zeros(n_rows)})\n",
" return tmp\n",
"\n",
"data = make_t(5)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" type | \n",
" feature | \n",
" type_v1 | \n",
" type_v2 | \n",
" type_v3 | \n",
" type_v4 | \n",
" type_v5 | \n",
" type_v6 | \n",
" type_v7 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" 0.0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" B | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" A | \n",
" 0.0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" A | \n",
" 0.0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" A | \n",
" 0.0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" type feature type_v1 type_v2 type_v3 type_v4 type_v5 type_v6 type_v7\n",
"0 A 0.0 1 1 1 1 0 1 0\n",
"1 B 0.0 0 0 0 0 1 0 1\n",
"2 A 0.0 1 1 1 1 0 1 0\n",
"3 A 0.0 1 1 1 1 0 1 0\n",
"4 A 0.0 1 1 1 1 0 1 0"
]
},
"execution_count": 155,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['type_v1'] = data['type'].apply(lambda x: 1 if x == \"A\" else 0)\n",
"data['type_v2'] = (data['type']=='A').astype(int)\n",
"data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)\n",
"data['type_v4'] = data['type'].map({'A': 1, 'B': 0})\n",
"data['type_v5'] = data['type'].factorize()[0] # некорректный ответ\n",
"data['type_v6'] = pd.get_dummies(data['type'])['A'] # uint8!!!\n",
"from sklearn import preprocessing\n",
"data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # некорректный ответ\n",
"\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [],
"source": [
"data = make_t(10000000)"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.14 s, sys: 72 ms, total: 2.22 s\n",
"Wall time: 2.25 s\n"
]
}
],
"source": [
"%%time\n",
"data['type_v1'] = data['type'].apply(lambda x: 1 if x == \"A\" else 0)\n",
"# 2.14 s - 2.18 s"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.82 s, sys: 60 ms, total: 1.88 s\n",
"Wall time: 1.89 s\n"
]
}
],
"source": [
"%%time\n",
"data['type_v1'] = data['type'].apply(lambda x: \"1\" if x == \"A\" else \"0\").astype(int)\n",
"# 1.82 s - 1.86 s"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 364 ms, sys: 20 ms, total: 384 ms\n",
"Wall time: 386 ms\n"
]
}
],
"source": [
"%%time\n",
"data['type_v2'] = (data['type']=='A').astype(int)\n",
"# 348 - 364"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 376 ms, sys: 20 ms, total: 396 ms\n",
"Wall time: 398 ms\n"
]
}
],
"source": [
"%%time\n",
"data['type_v3'] = np.where(data['type'] == 'A', 1 ,0)\n",
"# 380-398"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 420 ms, sys: 16 ms, total: 436 ms\n",
"Wall time: 443 ms\n"
]
}
],
"source": [
"%%time\n",
"data['type_v4'] = data['type'].map({'A': 1, 'B': 0})\n",
"# 400-424"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 304 ms, sys: 36 ms, total: 340 ms\n",
"Wall time: 357 ms\n"
]
}
],
"source": [
"%%time\n",
"data['type_v5'] = data['type'].factorize()[0]\n",
"# 304-324"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 364 ms, sys: 28 ms, total: 392 ms\n",
"Wall time: 395 ms\n"
]
}
],
"source": [
"%%time\n",
"data['type_v6'] = pd.get_dummies(data['type'])['A']\n",
"# 360-392"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.5 s, sys: 36 ms, total: 5.53 s\n",
"Wall time: 5.52 s\n"
]
}
],
"source": [
"%%time\n",
"from sklearn import preprocessing\n",
"data['type_v7'] = preprocessing.LabelEncoder().fit_transform(data['type']) # некорректный ответ\n",
"# 5.47 s - 5.57 s"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Задача - расщепить"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" feature | \n",
" A/B | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0 | \n",
" 23/60 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 65/76 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 66/53 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 57/53 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 85/18 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" feature A/B\n",
"0 0.0 23/60\n",
"1 0.0 65/76\n",
"2 0.0 66/53\n",
"3 0.0 57/53\n",
"4 0.0 85/18"
]
},
"execution_count": 169,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def make_ab(n_rows):\n",
" tmp = pd.DataFrame({'A': (100*np.random.rand(n_rows)).astype(int), 'B': (100*np.random.rand(n_rows)).astype(int), 'feature': np.zeros(n_rows)})\n",
" tmp['A/B'] = tmp['A'].astype(str) + '/' + tmp['B'].astype(str)\n",
" del tmp['A']\n",
" del tmp['B']\n",
" return tmp\n",
"\n",
"data = make_ab(5)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" feature | \n",
" A/B | \n",
" A_v1 | \n",
" B_v1 | \n",
" A_v2 | \n",
" B_v2 | \n",
" A_v3 | \n",
" B_v3 | \n",
" A_v4 | \n",
" B_v4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0 | \n",
" 23/60 | \n",
" 23 | \n",
" 60 | \n",
" 23 | \n",
" 60 | \n",
" 23 | \n",
" 60 | \n",
" 23 | \n",
" 60 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 65/76 | \n",
" 65 | \n",
" 76 | \n",
" 65 | \n",
" 76 | \n",
" 65 | \n",
" 76 | \n",
" 65 | \n",
" 76 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 66/53 | \n",
" 66 | \n",
" 53 | \n",
" 66 | \n",
" 53 | \n",
" 66 | \n",
" 53 | \n",
" 66 | \n",
" 53 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 57/53 | \n",
" 57 | \n",
" 53 | \n",
" 57 | \n",
" 53 | \n",
" 57 | \n",
" 53 | \n",
" 57 | \n",
" 53 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 85/18 | \n",
" 85 | \n",
" 18 | \n",
" 85 | \n",
" 18 | \n",
" 85 | \n",
" 18 | \n",
" 85 | \n",
" 18 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" feature A/B A_v1 B_v1 A_v2 B_v2 A_v3 B_v3 A_v4 B_v4\n",
"0 0.0 23/60 23 60 23 60 23 60 23 60\n",
"1 0.0 65/76 65 76 65 76 65 76 65 76\n",
"2 0.0 66/53 66 53 66 53 66 53 66 53\n",
"3 0.0 57/53 57 53 57 53 57 53 57 53\n",
"4 0.0 85/18 85 18 85 18 85 18 85 18"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp = data['A/B'].str.split('/')\n",
"data['A_v1'] = tmp.apply(lambda x: x[0])\n",
"data['B_v1'] = tmp.apply(lambda x: x[1])\n",
"\n",
"data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', 1).tolist())\n",
"\n",
"data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)\n",
"\n",
"st = '/'.join(data['A/B'])\n",
"data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))\n",
"\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
"data = make_ab(10000000)"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13.1 s, sys: 1.51 s, total: 14.6 s\n",
"Wall time: 14.6 s\n"
]
}
],
"source": [
"%%time\n",
"tmp = data['A/B'].str.split('/')\n",
"data['A_v1'] = tmp.apply(lambda x: x[0])\n",
"data['B_v1'] = tmp.apply(lambda x: x[1])\n",
"# 12.5 s-13.4 s"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 9.94 s, sys: 176 ms, total: 10.1 s\n",
"Wall time: 10.2 s\n"
]
}
],
"source": [
"%%time\n",
"data[['A_v2', 'B_v2']] = pd.DataFrame(data['A/B'].str.split('/', 1).tolist())\n",
"# 10.2 s - 12.2 s"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 26.1 s, sys: 368 ms, total: 26.5 s\n",
"Wall time: 26.5 s\n"
]
}
],
"source": [
"%%time\n",
"data[['A_v3', 'B_v3']] = data['A/B'].str.split('/', expand=True)\n",
"# 26.1 s - 29.2 s"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.65 s, sys: 168 ms, total: 3.82 s\n",
"Wall time: 3.84 s\n"
]
}
],
"source": [
"%%time\n",
"st = '/'.join(data['A/B'])\n",
"data[['A_v4', 'B_v4']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))\n",
"# 3.65 s - 4.54 s"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Задача - заменить пропуски средним"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" type | \n",
" feature | \n",
" feature_v1 | \n",
" feature_v2 | \n",
" feature_v3 | \n",
" feature_v4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" test | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" test | \n",
" 43.0 | \n",
" 43.0 | \n",
" 43.0 | \n",
" 43.0 | \n",
" 43.0 | \n",
"
\n",
" \n",
" 2 | \n",
" test | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" train | \n",
" 4.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 4 | \n",
" train | \n",
" 18.0 | \n",
" 18.0 | \n",
" 18.0 | \n",
" 18.0 | \n",
" 18.0 | \n",
"
\n",
" \n",
" 5 | \n",
" train | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 6 | \n",
" train | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 7 | \n",
" train | \n",
" 25.0 | \n",
" 25.0 | \n",
" 25.0 | \n",
" 25.0 | \n",
" 25.0 | \n",
"
\n",
" \n",
" 8 | \n",
" train | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 9 | \n",
" train | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" type feature feature_v1 feature_v2 feature_v3 feature_v4\n",
"0 test NaN NaN NaN NaN NaN\n",
"1 test 43.0 43.0 43.0 43.0 43.0\n",
"2 test NaN NaN NaN NaN NaN\n",
"3 train 4.0 4.0 4.0 4.0 4.0\n",
"4 train 18.0 18.0 18.0 18.0 18.0\n",
"5 train NaN NaN NaN NaN NaN\n",
"6 train NaN NaN NaN NaN NaN\n",
"7 train 25.0 25.0 25.0 25.0 25.0\n",
"8 train NaN NaN NaN NaN NaN\n",
"9 train NaN NaN NaN NaN NaN"
]
},
"execution_count": 178,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def make_t(n_rows):\n",
" tmp = pd.DataFrame({'type': np.where(np.random.rand(n_rows)<0.5, 'train', 'test'),\n",
" 'feature': np.where(np.random.rand(n_rows)<0.5, (100*np.random.rand(n_rows)).astype(int), np.nan)})\n",
" tmp['feature_v1'] = tmp['feature']\n",
" tmp['feature_v2'] = tmp['feature']\n",
" tmp['feature_v3'] = tmp['feature']\n",
" tmp['feature_v4'] = tmp['feature']\n",
" return tmp\n",
"\n",
"data = make_t(10)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" type | \n",
" feature | \n",
" feature_v1 | \n",
" feature_v2 | \n",
" feature_v3 | \n",
" feature_v4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" test | \n",
" NaN | \n",
" 43.000000 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
"
\n",
" \n",
" 1 | \n",
" test | \n",
" 43.0 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
"
\n",
" \n",
" 2 | \n",
" test | \n",
" NaN | \n",
" 43.000000 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
" 43.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" train | \n",
" 4.0 | \n",
" 4.000000 | \n",
" 4.000000 | \n",
" 4.000000 | \n",
" 4.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" train | \n",
" 18.0 | \n",
" 18.000000 | \n",
" 18.000000 | \n",
" 18.000000 | \n",
" 18.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" train | \n",
" NaN | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
"
\n",
" \n",
" 6 | \n",
" train | \n",
" NaN | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
"
\n",
" \n",
" 7 | \n",
" train | \n",
" 25.0 | \n",
" 25.000000 | \n",
" 25.000000 | \n",
" 25.000000 | \n",
" 25.000000 | \n",
"
\n",
" \n",
" 8 | \n",
" train | \n",
" NaN | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
"
\n",
" \n",
" 9 | \n",
" train | \n",
" NaN | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
" 15.666667 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" type feature feature_v1 feature_v2 feature_v3 feature_v4\n",
"0 test NaN 43.000000 43.000000 43.000000 43.000000\n",
"1 test 43.0 43.000000 43.000000 43.000000 43.000000\n",
"2 test NaN 43.000000 43.000000 43.000000 43.000000\n",
"3 train 4.0 4.000000 4.000000 4.000000 4.000000\n",
"4 train 18.0 18.000000 18.000000 18.000000 18.000000\n",
"5 train NaN 15.666667 15.666667 15.666667 15.666667\n",
"6 train NaN 15.666667 15.666667 15.666667 15.666667\n",
"7 train 25.0 25.000000 25.000000 25.000000 25.000000\n",
"8 train NaN 15.666667 15.666667 15.666667 15.666667\n",
"9 train NaN 15.666667 15.666667 15.666667 15.666667"
]
},
"execution_count": 182,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"name = 'feature_v1'\n",
"data.loc[data['type'] == 'test', name] = \\\n",
" data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())\n",
"data.loc[data['type'] == 'train', name] = \\\n",
" data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())\n",
"\n",
"name = 'feature_v2'\n",
"data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))\n",
"\n",
"name = 'feature_v3'\n",
"data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')\n",
"\n",
"name = 'feature_v4'\n",
"data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])\n",
"\n",
"#name = 'feature_v4'\n",
"#gb = data.groupby('type')\n",
"#mn = gb.mean()\n",
"#for gn, x in gb:\n",
"# x[name].fillna(mn.loc[gn], inplace=True)\n",
"\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
"data = make_t(10000000)"
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.66 s, sys: 132 ms, total: 3.79 s\n",
"Wall time: 3.81 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"name = 'feature_v1'\n",
"data.loc[data['type'] == 'test', name] = data[data['type'] == 'test'][name].fillna(data[data['type'] == 'test'][name].mean())\n",
"data.loc[data['type'] == 'train', name] = data[data['type'] == 'train'][name].fillna(data[data['type'] == 'train'][name].mean())\n",
"\n",
"# 3.44 s - 3.84 s"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.9 s, sys: 152 ms, total: 2.05 s\n",
"Wall time: 2.06 s\n"
]
}
],
"source": [
"%%time\n",
"name = 'feature_v2'\n",
"data[name] = data.groupby('type')[name].transform(lambda x: x.fillna(x.mean()))\n",
"# 1.9 s - 2.04 s"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.2 s, sys: 128 ms, total: 1.32 s\n",
"Wall time: 1.35 s\n"
]
}
],
"source": [
"%%time\n",
"name = 'feature_v3'\n",
"data.loc[data[name].isnull(), name] = data.groupby('type')[name].transform('mean')\n",
"# 1.17 - 1.18 s"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.37 s, sys: 72 ms, total: 1.44 s\n",
"Wall time: 1.45 s\n"
]
}
],
"source": [
"%%time\n",
"name = 'feature_v4'\n",
"data[name] = np.where(data[name].isnull(), data['type'].map(data.groupby('type')[name].mean()), data[name])\n",
"# 1.26 s - 1.38"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}