{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выполнил Бибик Денис, ММП."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Оптимизация кода для препроцессинга данных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Часть 1.Корректировка данных, слайд 13"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Сгенерируем похожие данные"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "num_elenments = 3000000\n",
    "\n",
    "measures = [str(np.random.choice(range(100, 200))) + '/' + str(np.random.choice(range(80, 100))) \n",
    "            for i in range(num_elenments)]\n",
    "df_ = pd.DataFrame(measures, columns = ['Давление'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Давление</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>195/87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>116/93</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>146/91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>172/84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>147/98</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Давление\n",
       "0   195/87\n",
       "1   116/93\n",
       "2   146/91\n",
       "3   172/84\n",
       "4   147/98"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Перед каждым экспирементом будем копировать датасет"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.89 s, sys: 588 ms, total: 9.48 s\n",
      "Wall time: 9.47 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#Оригинальный код\n",
    "tmp = df['Давление'].str.split('/')\n",
    "df['в.давл.'] = tmp.apply(lambda x: x[0])\n",
    "df['н.давл.'] = tmp.apply(lambda x: x[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.88 s, sys: 644 ms, total: 9.52 s\n",
      "Wall time: 9.44 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#Попробуем избавится от apply\n",
    "tmp = df['Давление'].str.split('/').str\n",
    "df['в.давл.'] = tmp.get(0)\n",
    "df['н.давл.'] = tmp.get(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.82 s, sys: 164 ms, total: 4.99 s\n",
      "Wall time: 4.99 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#Теперь импользуем встроенную параметр expand\n",
    "df[['в.давл.', 'н.давл.']] = df['Давление'].str.split('/', expand=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ускорение почти в два раза"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 963 ms, sys: 164 ms, total: 1.13 s\n",
      "Wall time: 1.13 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#Попробуем такой вариант, он не будет работать если будут ошибки в формате\n",
    "st = '/'.join(df['Давление'])\n",
    "df[['в.давл.', 'н.давл.']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ускоорение в 8 раз"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Часть 2. Заполнение данных, слайд 19"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Сгенерируем похожие данные"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_elements = 5000000\n",
    "train_len = int(0.75 * num_elements)\n",
    "num_nan = 300000\n",
    "df_ = pd.DataFrame(np.random.choice(a = range(60, 100), size = (num_elements, 1)), columns = ['площадь'])\n",
    "df_['data'] = ['train'] * train_len +  ['test'] * (num_elements - train_len)\n",
    "nan_positions = np.random.choice(num_elements, num_nan, replace=False)\n",
    "df_.loc[nan_positions, 'площадь'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>площадь</th>\n",
       "      <th>data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>72.0</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>73.0</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>93.0</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>69.0</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>80.0</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   площадь   data\n",
       "0     72.0  train\n",
       "1     73.0  train\n",
       "2     93.0  train\n",
       "3     69.0  train\n",
       "4     80.0  train"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.58 s, sys: 132 ms, total: 2.71 s\n",
      "Wall time: 2.71 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#Оригинальный код\n",
    "df.loc[df['data'] == 'train', 'площадь'] =\\\n",
    "df[df['data'] == 'train']['площадь'].fillna(df[df['data'] == 'train']['площадь'].mean())\n",
    "df.loc[df['data'] == 'test', 'площадь'] =\\\n",
    "df[df['data'] == 'test']['площадь'].fillna(df[df['data'] == 'test']['площадь'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 849 ms, sys: 304 ms, total: 1.15 s\n",
      "Wall time: 1.15 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#Используем transform\n",
    "gb = df.groupby('data')\n",
    "df['площадь'] = gb.transform(lambda x: x.fillna(x.mean()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ускорение более чем в два раза"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Разберемся что сколько времени занимает в коде выше"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 417 ms, sys: 120 ms, total: 537 ms\n",
      "Wall time: 536 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "gb = df.groupby('data')\n",
    "iterate = list(gb)\n",
    "del iterate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 37.6 ms, sys: 15.9 ms, total: 53.6 ms\n",
      "Wall time: 52.4 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "m = gb.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 628 ms, sys: 240 ms, total: 868 ms\n",
      "Wall time: 867 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df['площадь'] = gb.transform(lambda x: x.fillna(0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Получается что transform занимает значимую часть времени"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = df_.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 709 ms, sys: 188 ms, total: 897 ms\n",
      "Wall time: 895 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "gb = df.groupby('data')\n",
    "mean = gb.mean()\n",
    "for gn, x in gb:\n",
    "    x['площадь'].fillna(mean.loc[gn])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ускорили еще на 250 мс"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}