{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "进行较为复杂的ensemble方法" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import re\n", "from bs4 import BeautifulSoup\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from gensim.models.doc2vec import TaggedDocument\n", "\n", "\n", "def review_to_words(raw_review):\n", " review_text = BeautifulSoup(raw_review, 'lxml').get_text()\n", " letters_only = re.sub(\"[^a-zA-Z]\", \" \", review_text) \n", " words = letters_only.lower().split()\n", " stops = set(stopwords.words(\"english\"))\n", " meaningful_words = [w for w in words if not w in stops]\n", " return(\" \".join(meaningful_words))\n", "\n", "\n", "def tag_reviews(reviews, prefix):\n", " tagged = []\n", " for i, review in enumerate(reviews):\n", " tagged.append(TaggedDocument(words=review.split(), tags=[prefix + '_%s' % i]))\n", " return tagged" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cleaning and parsing the training set movie reviews...\n", "Cleaning and parsing the test set movie reviews...\n", "Cleaning and parsing the test set movie reviews...\n", "Review 5000 of 50000\n", "\n", "Review 10000 of 50000\n", "\n", "Review 15000 of 50000\n", "\n", "Review 20000 of 50000\n", "\n", "Review 25000 of 50000\n", "\n", "Review 30000 of 50000\n", "\n", "Review 35000 of 50000\n", "\n", "Review 40000 of 50000\n", "\n", "Review 45000 of 50000\n", "\n", "Review 50000 of 50000\n", "\n" ] } ], "source": [ "# gensim modules\n", "from gensim.models import Doc2Vec\n", "\n", "# numpy\n", "import numpy as np\n", "\n", "# classifier\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "# random\n", "from random import shuffle\n", "\n", "# preprocess packages\n", "import pandas as pd\n", "# import sys\n", "# sys.path.insert(0, '..')\n", "# from utils.TextPreprocess import review_to_words, tag_reviews\n", "\n", "\n", "'''\n", "Training Data\n", "'''\n", "train = pd.read_csv(\"../Sentiment/data/labeledTrainData.tsv\", header=0, \n", " delimiter='\\t', quoting=3, error_bad_lines=False)\n", "num_reviews = train[\"review\"].size\n", "\n", "print(\"Cleaning and parsing the training set movie reviews...\")\n", "clean_train_reviews = []\n", "for i in range(0, num_reviews):\n", " clean_train_reviews.append(review_to_words(train[\"review\"][i]))\n", "\n", "'''\n", "Test Data\n", "'''\n", "test = pd.read_csv(\"../Sentiment/data/testData.tsv\", header = 0, delimiter = \"\\t\", quoting = 3)\n", "\n", "num_reviews = len(test[\"review\"])\n", "clean_test_reviews = []\n", "\n", "print(\"Cleaning and parsing the test set movie reviews...\")\n", "for i in range(0, num_reviews):\n", " clean_review = review_to_words(test[\"review\"][i])\n", " clean_test_reviews.append(clean_review)\n", "\n", "\n", "# # Unlabeled Train Data\n", "# unlabeled_reviews = pd.read_csv(\"../Sentiment/data/unlabeledTrainData.tsv\", header = 0, delimiter = \"\\t\", quoting = 3)\n", "# num_reviews = len(unlabeled_reviews[\"review\"])\n", "# clean_unlabeled_reviews = []\n", "\n", "# print(\"Cleaning and parsing the test set movie reviews...\")\n", "# for i in range( 0, num_reviews):\n", "# if( (i+1)%5000 == 0 ):\n", "# print(\"Review %d of %d\\n\" % (i+1, num_reviews))\n", "# clean_review = review_to_words(unlabeled_reviews[\"review\"][i])\n", "# 
{ "cell_type": "markdown", "metadata": {}, "source": [ "Load the trained doc2vec model and obtain the sentence vectors for the train and test sets." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "train_data_features_d2v = []\n", "test_data_features_d2v = []" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "model_dbow = Doc2Vec.load('../Sentiment/src/deep/model/doc2vec_lr100')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hmm, it turns out that reading the vectors out of the model also requires tagged objects such as train_tagged. It is easier to just save the processed vectors directly; that is done in Part 2.9." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "train_data_features_d2v = np.loadtxt('../Sentiment/data/train_feature_d2v.txt')\n", "test_data_features_d2v = np.loadtxt('../Sentiment/data/test_feature_d2v.txt')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from scipy.sparse import bsr_matrix\n", "from sklearn.svm import SVC\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "num_reviews = len(test[\"review\"])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "result = [0.0 for i in range(num_reviews)]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "25000" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(result)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import random\n", "\n", "def sample(train_bow, train_d2v, label):\n", "    # Randomly split the reviews, their doc2vec vectors and their labels into two equal halves.\n", "    num = len(label)\n", "    index_set = set(random.sample(range(num), int(num / 2)))\n", "    \n", "    l1_train_bow = []\n", "    l1_train_d2v = []\n", "    l1_label = []\n", "    \n", "    l2_train_bow = []\n", "    l2_train_d2v = []\n", "    l2_label = []\n", "    \n", "    for i in range(num):\n", "        if i in index_set:\n", "            l1_train_bow.append(train_bow[i])\n", "            l1_train_d2v.append(train_d2v[i])\n", "            l1_label.append(label[i])\n", "        else:\n", "            l2_train_bow.append(train_bow[i])\n", "            l2_train_d2v.append(train_d2v[i])\n", "            l2_label.append(label[i])\n", "    \n", "    return l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25000\n", "(25000, 100)\n", "25000\n" ] } ], "source": [ "print(len(clean_train_reviews))\n", "print(train_data_features_d2v.shape)\n", "print(len(train[\"sentiment\"].values))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, train[\"sentiment\"].values)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12500\n", "12500\n", "12500\n", "12500\n", "12500\n", "12500\n" ] } ], "source": [ "print(len(l1_train_bow))\n", "print(len(l1_train_d2v))\n", "print(len(l2_train_bow))\n", "print(len(l2_train_d2v))\n", "print(len(l1_label))\n", "print(len(l2_label))" ]
}, { "cell_type": "markdown", "metadata": {}, "source": [ "我想搞清楚这个sample()函数究竟在干什么。\n", "\n", "明白了,`random.sample(range(25000), 12500)`,其实就是从25000个数字里,随机调出12500个。这里又多加了个set,感觉有点多余" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "num = len(train[\"sentiment\"].values) # num = 25000\n", "index_set = set(random.sample(range(num), int(num / 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "明白了,整个sample函数,其实就是把12500个训练集平均分成了两部分,训练集相关数据有clean_train_reviews(实际的sentence),train_data_features_d2v(经过doc2vec处理的sentence vector)。对应的标签也分成了两部分。" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import heapq \n", "\n", "def select_feature(filePath, k):\n", "\tread = open(filePath, 'r')\n", "\tlab_fea = {}\n", "\t\n", "\tfor line in read:\n", "\t\tline_arr = line.strip().split()\n", "\t\tif len(line_arr) - 1 <= k:\n", "\t\t\tlab_fea[line_arr[0]] = [kv.split(':')[0] for kv in line_arr[1 : ]]\n", "\t\telse:\n", "\t\t\theap = []\n", "\t\t\theapq.heapify(heap)\n", "\t\t\tfor kv in line_arr[1 : ]:\n", "\t\t\t\tkey, val = kv.split(':')\n", "\t\t\t\tif len(heap) < k:\n", "\t\t\t\t\theapq.heappush(heap, (float(val), key))\n", "\t\t\t\telse:\n", "\t\t\t\t\tif float(val) > heap[0][0]:\n", "\t\t\t\t\t\theapq.heappop(heap)\n", "\t\t\t\t\t\theapq.heappush(heap, (float(val), key))\n", "\t\t\tlab_fea[line_arr[0]] = [heapq.heappop(heap)[1] for i in range(len(heap))]\n", "\tread.close()\n", "\treturn lab_fea\n", "\n", "lab_fea = select_feature('feature_chi.txt', 1000)['1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "我们一般处理方式有2种:\n", "1)对数据先fit,再transform,好处是我可以拿到数据变换(比如scaling/幅度变换/标准化)的参数,这样你可以在测试集上也一样做相同的数据变换处理。即先对训练集做fit,然后再对训练集和测试集做transform\n", "2)fit_trainsform,一次性完成数据的变换(比如scaling/幅度变换/标准化),比较快。但是如果在训练集和测试集上用fit_trainsform,可能执行的是两套变换标准(因为训练集和测试集幅度不一样)\n", "\n", "[这个解释](https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models)的也很清楚,transform主要就是为了做中心化之类的预处理操作,让数据更好用一些。" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "training bow ...\n" ] } ], "source": [ "print(\"training bow ...\")\n", "vectorizer_bow = TfidfVectorizer(analyzer = \"word\",\n", " tokenizer = None,\n", " preprocessor = None,\n", " stop_words = None,\n", " vocabulary = lab_fea,\n", " max_features = 19000)\n", "\n", "l1_train_features_bow = vectorizer_bow.fit_transform(l1_train_bow)\n", "l1_train_features_bow = bsr_matrix(l1_train_features_bow)\n", "\n", "l1_lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) \n", "l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)\n", "\n", "l2_test_features_bow = vectorizer_bow.transform(l2_train_bow)\n", "l2_test_features_bow = bsr_matrix(l2_test_features_bow)\n", "\n", "l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:,1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "上面是先训练了一个TfidfVectorizer,对l1_train_bow(即12500个sentence)进行计算得到了l1_train_features_bow(代表每个sentence的特征向量,每个sentence 1000维)。然后用LR对(l1_train_features_bow, l1_label)进行了训练。然后把训练好的模型,对l2_train_bow(l2_test_features_bow)进行了预测。" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(12500, 1000)" ] }, "execution_count": 28, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "l1_train_features_bow.shape" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train doc2vec ...\n" ] } ], "source": [ "print(\"train doc2vec ...\")\n", " \n", "l1_train_features_d2v = bsr_matrix(l1_train_d2v)\n", "l2_test_features_d2v = bsr_matrix(l2_train_d2v)\n", "\n", "l1_svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)\n", "l1_svm_d2v = l1_svm_d2v.fit(l1_train_features_d2v, l1_label)\n", "\n", "l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:,1]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(12500,)" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l2_result_d2v.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "上面也是,只拿了12500个doc2vec向量,l1_train_d2v,来做训练,分类器是svm,然后对l2_train_d2v进行了预测。" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train ensemble ...\n" ] } ], "source": [ "print(\"train ensemble ...\")\n", " \n", "train_data_features_ens = []\n", "\n", "for i in range(len(l2_result_bow)):\n", " vector = []\n", " vector.append(l2_result_bow[i])\n", " vector.append(l2_result_d2v[i])\n", "\n", " train_data_features_ens.append(vector)\n", "\n", "lr_ens = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "lr_ens = lr_ens.fit(train_data_features_ens, l2_label)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[0.23886548792325121, 0.15337969425958606],\n", " [0.81253080751969953, 0.88316226120104124]]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data_features_ens[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "这里的一个vector包含两个数字`[l2_result_bow[i], l2_result_d2v[i]]`,所以这里我们得到的train_data_features_ens大概是这样的一个形式`[[l2_result_bow[0], l2_result_d2v[0]], [l2_result_bow[1], l2_result_d2v[1]]]`,写成数字就是上面那样的输出。" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "final predict ...\n" ] } ], "source": [ "print(\"final predict ...\")\n", "train_bow = vectorizer_bow.fit_transform(clean_train_reviews)\n", "train_bow = bsr_matrix(train_bow)\n", "\n", "test_bow = vectorizer_bow.transform(clean_test_reviews)\n", "test_bow = bsr_matrix(test_bow)\n", "\n", "train_d2v = bsr_matrix(train_data_features_d2v)\n", "test_d2v = bsr_matrix(test_data_features_d2v)\n", "\n", "lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "lr_bow = lr_bow.fit(train_bow, list(train[\"sentiment\"]))\n", "\n", "svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)\n", "svm_d2v = svm_d2v.fit(train_d2v, train[\"sentiment\"].values)\n", "\n", "result_bow = lr_bow.predict_proba(test_bow)[:,1]\n", "result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_data_features_ens = []\n", " \n", "for i in 
{ "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "final predict ...\n" ] } ], "source": [ "print(\"final predict ...\")\n", "train_bow = vectorizer_bow.fit_transform(clean_train_reviews)\n", "train_bow = bsr_matrix(train_bow)\n", "\n", "test_bow = vectorizer_bow.transform(clean_test_reviews)\n", "test_bow = bsr_matrix(test_bow)\n", "\n", "train_d2v = bsr_matrix(train_data_features_d2v)\n", "test_d2v = bsr_matrix(test_data_features_d2v)\n", "\n", "lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "lr_bow = lr_bow.fit(train_bow, list(train[\"sentiment\"]))\n", "\n", "svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)\n", "svm_d2v = svm_d2v.fit(train_d2v, train[\"sentiment\"].values)\n", "\n", "result_bow = lr_bow.predict_proba(test_bow)[:,1]\n", "result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_data_features_ens = []\n", " \n", "for i in range(len(result_bow)):\n", "    vector = []\n", "    vector.append(result_bow[i])\n", "    vector.append(result_d2v[i])\n", "\n", "    test_data_features_ens.append(vector)\n", "\n", "result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:,1]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[0.99794119998088815, 0.99910189748012901],\n", " [0.018351063999300796, 0.0014584329684225311]]" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data_features_ens[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once everything above has finished, one epoch is complete. The results should then be accumulated into result and, at the end, divided by the number of epochs to get the average predicted probability. (Things got a bit hard to follow from cell 31 onward.) Below, the whole pipeline is written out again from start to finish; to speed things up, the SVC is replaced with logistic regression:" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "25000" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = [0.0 for i in range(num_reviews)]\n", "len(result)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch: 0\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 1\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 2\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 3\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 4\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n" ] } ], "source": [ "max_iter = 5\n", "for epoch in range(max_iter):\n", "    print(\"epoch: \" + str(epoch))\n", "    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, train[\"sentiment\"].values)\n", "    \n", "    \n", "    print(\"training bow ...\")\n", "    vectorizer_bow = TfidfVectorizer(analyzer = \"word\",\n", " tokenizer = None,\n", " preprocessor = None,\n", " stop_words = None,\n", " vocabulary = lab_fea,\n", " max_features = 19000)\n", "\n", "    l1_train_features_bow = vectorizer_bow.fit_transform(l1_train_bow)\n", "    l1_train_features_bow = bsr_matrix(l1_train_features_bow)\n", "\n", "    l1_lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) \n", "    l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)\n", "\n", "    l2_test_features_bow = vectorizer_bow.transform(l2_train_bow)\n", "    l2_test_features_bow = bsr_matrix(l2_test_features_bow)\n", "\n", "    l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:,1]\n", "    \n", "    \n", "    print(\"train doc2vec ...\")\n", "    l1_train_features_d2v = bsr_matrix(l1_train_d2v)\n", "    l2_test_features_d2v = bsr_matrix(l2_train_d2v)\n", "\n", "    l1_svm_d2v = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) \n", "    l1_svm_d2v = l1_svm_d2v.fit(l1_train_features_d2v, l1_label)\n", "\n", "    l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:,1]\n", "    \n", "    \n", "    print(\"train ensemble ...\")\n", "    train_data_features_ens = []\n", "\n", "    for i in range(len(l2_result_bow)):\n", "        vector = []\n", "        vector.append(l2_result_bow[i])\n", "        vector.append(l2_result_d2v[i])\n", "\n",
"        train_data_features_ens.append(vector)\n", "\n", "    lr_ens = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "    lr_ens = lr_ens.fit(train_data_features_ens, l2_label)\n", "    \n", "    \n", "    print(\"final predict ...\")\n", "    train_bow = vectorizer_bow.fit_transform(clean_train_reviews)\n", "    train_bow = bsr_matrix(train_bow)\n", "\n", "    test_bow = vectorizer_bow.transform(clean_test_reviews)\n", "    test_bow = bsr_matrix(test_bow)\n", "\n", "    train_d2v = bsr_matrix(train_data_features_d2v)\n", "    test_d2v = bsr_matrix(test_data_features_d2v)\n", "\n", "    lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "    lr_bow = lr_bow.fit(train_bow, list(train[\"sentiment\"]))\n", "\n", "    svm_d2v = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "    svm_d2v = svm_d2v.fit(train_d2v, train[\"sentiment\"].values)\n", "\n", "    result_bow = lr_bow.predict_proba(test_bow)[:,1]\n", "    result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]\n", "    \n", "    test_data_features_ens = []\n", "    \n", "    for i in range(len(result_bow)):\n", "        vector = []\n", "        vector.append(result_bow[i])\n", "        vector.append(result_d2v[i])\n", "\n", "        test_data_features_ens.append(vector)\n", "\n", "    result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:,1]\n", "    \n", "    for i in range(num_reviews):\n", "        result[i] += result_test_ens[i]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Average the results over the 5 epochs." ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for i in range(num_reviews):\n", "    result[i] /= max_iter" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "result = np.array(result)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.97450031, 0.02570378, 0.57902932, ..., 0.05740643,\n", " 0.97011312, 0.65017741])" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ True, False, True, ..., False, True, True], dtype=bool)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result > 0.5" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "result_bool = result >= 0.5" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 1, ..., 0, 1, 1])" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result_bool * 1" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentiment
0\"12311_10\"1
1\"8348_2\"0
2\"5828_4\"1
3\"7186_2\"0
4\"12128_7\"1
\n", "
" ], "text/plain": [ " id sentiment\n", "0 \"12311_10\" 1\n", "1 \"8348_2\" 0\n", "2 \"5828_4\" 1\n", "3 \"7186_2\" 0\n", "4 \"12128_7\" 1" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combine = pd.DataFrame(data={'id': test['id'], \n", " 'sentiment': result_bool * 1})\n", "combine.head()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "output...\n" ] } ], "source": [ "print(\"output...\")\n", "combine.to_csv('../Sentiment/result/ensemble.csv', index=False, quoting=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "最后的结果是0.88968,我不知道作者是怎么得到0.96的,反正这样的结果也只是和combine一样罢了。" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_combine = pd.read_csv('../Sentiment/result/ensemble_final.csv', header=0)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentiment
012311_100.914962
18348_20.063295
25828_40.940739
37186_20.134307
412128_70.924105
\n", "
" ], "text/plain": [ " id sentiment\n", "0 12311_10 0.914962\n", "1 8348_2 0.063295\n", "2 5828_4 0.940739\n", "3 7186_2 0.134307\n", "4 12128_7 0.924105" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_combine.head()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_combine['sentiment'] = test_combine['sentiment'] >= 0.5\n", "test_combine['sentiment'] = test_combine['sentiment'].astype('int')" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentiment
012311_101
18348_20
25828_41
37186_20
412128_71
\n", "
" ], "text/plain": [ " id sentiment\n", "0 12311_10 1\n", "1 8348_2 0\n", "2 5828_4 1\n", "3 7186_2 0\n", "4 12128_7 1" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_combine.head()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "output...\n" ] } ], "source": [ "print(\"output...\")\n", "test_combine.to_csv('../Sentiment/result/test_combine.csv', index=False, quoting=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "呃……上面是作者原文件里的ensemble_final,我提交后也就0.89的程度……" ] } ], "metadata": { "kernelspec": { "display_name": "Python [py35]", "language": "python", "name": "Python [py35]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }