{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Глупый градиентный бенчмарк\n", "\n", "\n", "Берём **только** весовые признаки и скармливаем их бустингу" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# подгружаем все нужные пакеты\n", "import pandas as pd\n", "import numpy as np\n", "\n", "\n", "# для встроенных картинок\n", "%pylab inline\n", "# чуть покрасивше картинки:\n", "pd.set_option('display.mpl_style', 'default')\n", "figsize(12, 9)\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "#plt.rcParams['figure.figsize'] = 10, 7.5\n", "#plt.rcParams['axes.grid'] = True\n", "pd.set_option('display.max_columns', None)\n", "import matplotlib.pyplot as plt\n", "\n", "import matplotlib as mpl\n", "mpl.rcParams['font.family'] = 'Ubuntu'\n", "\n", "plt.rc('text', usetex=False)\n", "plt.rc('font', family='serif')\n", "plt.rc('font', weight='bold')\n", "plt.rc('xtick', labelsize=14) \n", "plt.rc('ytick', labelsize=14)\n", "\n", "# чтобы был русский шрифт\n", "from matplotlib import rc\n", " \n", "font = {'family': 'Droid Sans',\n", " 'weight': 'normal'}\n", "rc('font', **font)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from scipy.sparse import csr_matrix\n", "\n", "data = pd.read_csv('train2.csv')\n", "\n", "ss = csr_matrix((data['sum'].values, (data.id.values - 1, data.date.values - 1)))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "((110000,), (110000, 431))\n", "((110000,), (110000, 424))\n" ] } ], "source": [ "def split_train_test(ss):\n", " \"\"\"\n", " разделение на целевой вектор и спарс-матрицу\n", " \"\"\"\n", " y, s = ss[:, -7:], ss[:, :-7]\n", " y = np.array(y.todense())\n", " y = (((y > 0).cumsum(axis=1) == 1) * y).sum(axis=1)\n", " print (y.shape, s.shape)\n", " return y, s\n", " \n", "y, s = split_train_test(ss)\n", "y2, s2 = split_train_test(s)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def get_features(s):\n", " \"\"\"\n", " генерация признаков по спарс-матрице\n", " \"\"\"\n", " m, n = s.shape\n", " k = n % 7\n", " \n", " f = np.zeros((m, 17*24)) # матрица признаков\n", " \n", " \n", " for i in range(m): # отдельно по каждому пользователю\n", " sh = s[i,:].toarray().ravel()\n", " h = sh[k:].reshape(-1, 7)\n", " g = (((h > 0).cumsum(axis=1) == 1) * h).sum(axis=1) # все первые покупки\n", " \n", " q = h.shape[0]\n", " \n", " def get_weighted(w, q, g):\n", " \"\"\"\n", " для создания признаков весовых-схем\n", " w - веса\n", " q = len(g) - можно убрать\n", " g - перечень сумм\n", " \"\"\"\n", " return (np.dot (w, csr_matrix((np.ones(q), (np.arange(q), g)), shape=(q, 17)).toarray()) / sum(w))\n", " \n", " for deg in [0, 1, 2, 3]: # степени в весовой схеме и... индексы\n", " new_features = get_weighted(np.arange(q) ** deg, q, g)\n", " f[i, (17 * deg):(17 * (deg + 1))] = new_features\n", " f[i, (17*6 + 17 * deg):(17*6 + 17 * (deg + 1))] = new_features / max(new_features)\n", " \n", " \n", " # теперь по всем покупкам вообще\n", " l = len(sh)\n", " \n", " for deg in [0, 1, 2, 3]:\n", " new_features = get_weighted(np.arange(l) ** deg, l, sh)\n", " f[i, (17*12 + 17 * deg):(17*12 + 17 * (deg + 1))] = new_features\n", " f[i, (17*18 + 17 * deg):(17*18 + 17 * (deg + 1))] = new_features / max(new_features) \n", " \n", " if mod(i, 10000) == 0:\n", " print (i)\n", " \n", " return (f) " ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "10000\n", "20000\n", "30000\n", "40000\n", "50000\n", "60000\n", "70000\n", "80000\n", "90000\n", "100000\n", "0\n", "10000\n", "20000\n", "30000\n", "40000\n", "50000\n", "60000\n", "70000\n", "80000\n", "90000\n", "100000\n" ] } ], "source": [ "X = get_features(s)\n", "X2 = get_features(s2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import lightgbm as lgb\n", "from time import time\n", "from sklearn.metrics import log_loss\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import cross_val_predict" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "39.3623058796\n" ] } ], "source": [ "e = 0.02 * np.random.randn(*X.shape)\n", "\n", "tm = time()\n", "gbm = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=30, nthread=-1)\n", "gbm.fit(X2 + e, y2)\n", "a = gbm.predict(X)\n", "print (time() - tm)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(u'\\u0442\\u043e\\u0447\\u043d\\u043e\\u0441\\u0442\\u044c', 0.39496363636363635)\n" ] } ], "source": [ "print (u'точность', np.mean(a==y))\n", "# 0.39483636363636365 0.39496363636363635" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "10000\n", "20000\n", "30000\n", "40000\n", "50000\n", "60000\n", "70000\n", "80000\n", "90000\n", "100000\n" ] } ], "source": [ "X0 = get_features(ss)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [ "gbm = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=30, nthread=-1)\n", "gbm.fit(X, y)\n", "a = gbm.predict(X0)\n", "pd.DataFrame({'id': np.arange(1, 110001), 'sum':a}).to_csv('lgb_benchmark-1.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "gbm = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=30, nthread=-1)\n", "XX = np.concatenate([X, X2])\n", "# XX = XX + 0.02 * np.random.randn(*XX.shape)\n", "gbm.fit(XX, np.concatenate([y, y2]))\n", "a = gbm.predict(X0)\n", "\n", "pd.DataFrame({'id': np.arange(1, 110001), 'sum':a}).to_csv('lgb_benchmark-2.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }