{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Выполнил Бибик Денис, ММП." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Оптимизация кода для препроцессинга данных" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Часть 1. Корректировка данных, слайд 13" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Сгенерируем похожие данные" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "num_elenments = 3000000\n", "\n", "measures = [str(np.random.choice(range(100, 200))) + '/' + str(np.random.choice(range(80, 100))) \n", " for i in range(num_elenments)]\n", "df_ = pd.DataFrame(measures, columns = ['Давление'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Давление
0195/87
1116/93
2146/91
3172/84
4147/98
\n", "
" ], "text/plain": [ " Давление\n", "0 195/87\n", "1 116/93\n", "2 146/91\n", "3 172/84\n", "4 147/98" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Перед каждым экспериментом будем копировать датасет" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 8.89 s, sys: 588 ms, total: 9.48 s\n", "Wall time: 9.47 s\n" ] } ], "source": [ "%%time\n", "#Оригинальный код\n", "tmp = df['Давление'].str.split('/')\n", "df['в.давл.'] = tmp.apply(lambda x: x[0])\n", "df['н.давл.'] = tmp.apply(lambda x: x[1])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 8.88 s, sys: 644 ms, total: 9.52 s\n", "Wall time: 9.44 s\n" ] } ], "source": [ "%%time\n", "#Попробуем избавиться от apply\n", "tmp = df['Давление'].str.split('/').str\n", "df['в.давл.'] = tmp.get(0)\n", "df['н.давл.'] = tmp.get(1)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4.82 s, sys: 164 ms, total: 4.99 s\n", "Wall time: 4.99 s\n" ] } ], "source": [ "%%time\n", "#Теперь используем встроенный параметр expand\n", "df[['в.давл.', 'н.давл.']] = df['Давление'].str.split('/', expand=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ускорение почти в два раза" ] }, { "cell_type": "code", "execution_count": 
10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 963 ms, sys: 164 ms, total: 1.13 s\n", "Wall time: 1.13 s\n" ] } ], "source": [ "%%time\n", "#Попробуем такой вариант, он не будет работать, если будут ошибки в формате\n", "st = '/'.join(df['Давление'])\n", "df[['в.давл.', 'н.давл.']] = pd.DataFrame(np.array(st.split('/')).reshape(-1, 2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ускорение в 8 раз" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Часть 2. Заполнение данных, слайд 19" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Сгенерируем похожие данные" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "num_elements = 5000000\n", "train_len = int(0.75 * num_elements)\n", "num_nan = 300000\n", "df_ = pd.DataFrame(np.random.choice(a = range(60, 100), size = (num_elements, 1)), columns = ['площадь'])\n", "df_['data'] = ['train'] * train_len + ['test'] * (num_elements - train_len)\n", "nan_positions = np.random.choice(num_elements, num_nan, replace=False)\n", "df_.loc[nan_positions, 'площадь'] = np.nan" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
площадьdata
072.0train
173.0train
293.0train
369.0train
480.0train
\n", "
" ], "text/plain": [ " площадь data\n", "0 72.0 train\n", "1 73.0 train\n", "2 93.0 train\n", "3 69.0 train\n", "4 80.0 train" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.58 s, sys: 132 ms, total: 2.71 s\n", "Wall time: 2.71 s\n" ] } ], "source": [ "%%time\n", "#Оригинальный код\n", "df.loc[df['data'] == 'train', 'площадь'] =\\\n", "df[df['data'] == 'train']['площадь'].fillna(df[df['data'] == 'train']['площадь'].mean())\n", "df.loc[df['data'] == 'test', 'площадь'] =\\\n", "df[df['data'] == 'test']['площадь'].fillna(df[df['data'] == 'test']['площадь'].mean())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 849 ms, sys: 304 ms, total: 1.15 s\n", "Wall time: 1.15 s\n" ] } ], "source": [ "%%time\n", "#Используем transform\n", "gb = df.groupby('data')\n", "df['площадь'] = gb.transform(lambda x: x.fillna(x.mean()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ускорение более чем в два раза" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Разберемся, что и сколько времени занимает в коде выше" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 417 ms, sys: 120 ms, total: 537 ms\n", "Wall time: 536 ms\n" ] } ], "source": [ "%%time\n", "gb = 
df.groupby('data')\n", "iterate = list(gb)\n", "del iterate" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 37.6 ms, sys: 15.9 ms, total: 53.6 ms\n", "Wall time: 52.4 ms\n" ] } ], "source": [ "%%time\n", "m = gb.mean()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 628 ms, sys: 240 ms, total: 868 ms\n", "Wall time: 867 ms\n" ] } ], "source": [ "%%time\n", "df['площадь'] = gb.transform(lambda x: x.fillna(0))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Получается, что transform занимает значительную часть времени" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = df_.copy()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 709 ms, sys: 188 ms, total: 897 ms\n", "Wall time: 895 ms\n" ] } ], "source": [ "%%time\n", "gb = df.groupby('data')\n", "mean = gb.mean()\n", "for gn, x in gb:\n", " x['площадь'].fillna(mean.loc[gn])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ускорили еще на 250 мс. Оговорка: в цикле выше результат fillna не присваивается, поэтому df остается без изменений и замер не включает запись заполненных значений обратно в df." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } }, "nbformat": 4, "nbformat_minor": 2 }