{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполнил Бибик Денис, ММП."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оптимизация кода для препроцессинга данных"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Часть 1. Корректировка данных, слайд 13"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Сгенерируем похожие данные"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"num_elements = 3000000\n",
"\n",
"# Fixed seed so the same sample is generated on every Restart & Run All.\n",
"rng = np.random.RandomState(0)\n",
"# Draw all readings in two vectorized calls instead of calling np.random.choice per row.\n",
"# randint's upper bound is exclusive: upper is in 100..199, lower is in 80..99.\n",
"upper = pd.Series(rng.randint(100, 200, size=num_elements)).astype(str)\n",
"lower = pd.Series(rng.randint(80, 100, size=num_elements)).astype(str)\n",
"# One string column in the form 'upper/lower'.\n",
"df_ = (upper + '/' + lower).to_frame('Давление')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Давление | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 195/87 | \n",
"
\n",
" \n",
" 1 | \n",
" 116/93 | \n",
"
\n",
" \n",
" 2 | \n",
" 146/91 | \n",
"
\n",
" \n",
" 3 | \n",
" 172/84 | \n",
"
\n",
" \n",
" 4 | \n",
" 147/98 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Давление\n",
"0 195/87\n",
"1 116/93\n",
"2 146/91\n",
"3 172/84\n",
"4 147/98"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Перед каждым экспериментом будем копировать датасет"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.89 s, sys: 588 ms, total: 9.48 s\n",
"Wall time: 9.47 s\n"
]
}
],
"source": [
"%%time\n",
"#Оригинальный код\n",
"tmp = df['Давление'].str.split('/')\n",
"df['в.давл.'] = tmp.apply(lambda x: x[0])\n",
"df['н.давл.'] = tmp.apply(lambda x: x[1])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.88 s, sys: 644 ms, total: 9.52 s\n",
"Wall time: 9.44 s\n"
]
}
],
"source": [
"%%time\n",
"#Попробуем избавится от apply\n",
"tmp = df['Давление'].str.split('/').str\n",
"df['в.давл.'] = tmp.get(0)\n",
"df['н.давл.'] = tmp.get(1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.82 s, sys: 164 ms, total: 4.99 s\n",
"Wall time: 4.99 s\n"
]
}
],
"source": [
"%%time\n",
"#Теперь импользуем встроенную параметр expand\n",
"df[['в.давл.', 'н.давл.']] = df['Давление'].str.split('/', expand=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ускорение почти в два раза"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 963 ms, sys: 164 ms, total: 1.13 s\n",
"Wall time: 1.13 s\n"
]
}
],
"source": [
"%%time\n",
"# Fastest variant: glue the whole column into one string, split it once and reshape\n",
"# the flat array of parts into two columns.\n",
"# It requires every value to be exactly 'upper/lower'. The size check below catches a wrong\n",
"# total number of parts, but not format errors that happen to cancel each other out.\n",
"parts = np.array('/'.join(df['Давление']).split('/'))\n",
"if parts.size != 2 * len(df):\n",
"    raise ValueError('expected exactly one / separator per value')\n",
"# Reuse df's index so the rows line up even when the index is not 0..n-1.\n",
"df[['в.давл.', 'н.давл.']] = pd.DataFrame(parts.reshape(-1, 2), index=df.index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ускорение в 8 раз"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Часть 2. Заполнение данных, слайд 19"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Сгенерируем похожие данные"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"num_elements = 5000000\n",
"train_len = int(0.75 * num_elements)\n",
"num_nan = 300000\n",
"# Fixed seed so the same areas and the same gaps are generated on every Restart & Run All.\n",
"rng = np.random.RandomState(0)\n",
"# Whole-number areas in 60..99, stored as float from the start so that NaN can be written\n",
"# below without relying on an implicit int -> float upcast of the column.\n",
"df_ = pd.DataFrame({'площадь': rng.randint(60, 100, size=num_elements).astype(float)})\n",
"# The first 75% of the rows are labelled 'train', the remaining rows 'test'.\n",
"df_['data'] = np.repeat(['train', 'test'], [train_len, num_elements - train_len])\n",
"# Blank out num_nan distinct, randomly chosen rows.\n",
"nan_positions = rng.choice(num_elements, num_nan, replace=False)\n",
"df_.loc[nan_positions, 'площадь'] = np.nan"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" площадь | \n",
" data | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 72.0 | \n",
" train | \n",
"
\n",
" \n",
" 1 | \n",
" 73.0 | \n",
" train | \n",
"
\n",
" \n",
" 2 | \n",
" 93.0 | \n",
" train | \n",
"
\n",
" \n",
" 3 | \n",
" 69.0 | \n",
" train | \n",
"
\n",
" \n",
" 4 | \n",
" 80.0 | \n",
" train | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" площадь data\n",
"0 72.0 train\n",
"1 73.0 train\n",
"2 93.0 train\n",
"3 69.0 train\n",
"4 80.0 train"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.58 s, sys: 132 ms, total: 2.71 s\n",
"Wall time: 2.71 s\n"
]
}
],
"source": [
"%%time\n",
"#Оригинальный код\n",
"df.loc[df['data'] == 'train', 'площадь'] =\\\n",
"df[df['data'] == 'train']['площадь'].fillna(df[df['data'] == 'train']['площадь'].mean())\n",
"df.loc[df['data'] == 'test', 'площадь'] =\\\n",
"df[df['data'] == 'test']['площадь'].fillna(df[df['data'] == 'test']['площадь'].mean())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 849 ms, sys: 304 ms, total: 1.15 s\n",
"Wall time: 1.15 s\n"
]
}
],
"source": [
"%%time\n",
"# Use transform: fill the gaps in each group with that group's own mean.\n",
"gb = df.groupby('data')\n",
"# Select the column before transform so the result is a Series; assigning the output of a\n",
"# whole-frame transform to one column only works while df has a single value column.\n",
"df['площадь'] = gb['площадь'].transform(lambda x: x.fillna(x.mean()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ускорение более чем в два раза"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разберемся, сколько времени занимает каждая часть кода выше"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 417 ms, sys: 120 ms, total: 537 ms\n",
"Wall time: 536 ms\n"
]
}
],
"source": [
"%%time\n",
"gb = df.groupby('data')\n",
"iterate = list(gb)\n",
"del iterate"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 37.6 ms, sys: 15.9 ms, total: 53.6 ms\n",
"Wall time: 52.4 ms\n"
]
}
],
"source": [
"%%time\n",
"m = gb.mean()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 628 ms, sys: 240 ms, total: 868 ms\n",
"Wall time: 867 ms\n"
]
}
],
"source": [
"%%time\n",
"df['площадь'] = gb.transform(lambda x: x.fillna(0))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Получается, что transform занимает значительную часть времени"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = df_.copy()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 709 ms, sys: 188 ms, total: 897 ms\n",
"Wall time: 895 ms\n"
]
}
],
"source": [
"%%time\n",
"# Fill the gaps without transform: compute one mean per group, broadcast it to the rows\n",
"# through map, and let fillna use it only where the area is missing.\n",
"group_means = df.groupby('data')['площадь'].mean()\n",
"# fillna returns a new Series, so the result has to be assigned back to the column.\n",
"df['площадь'] = df['площадь'].fillna(df['data'].map(group_means))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ускорили еще на 250 мс"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}