{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "进行较为复杂的ensemble方法" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import re\n", "from bs4 import BeautifulSoup\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from gensim.models.doc2vec import TaggedDocument\n", "\n", "\n", "def review_to_words(raw_review):\n", " review_text = BeautifulSoup(raw_review, 'lxml').get_text()\n", " letters_only = re.sub(\"[^a-zA-Z]\", \" \", review_text) \n", " words = letters_only.lower().split()\n", " stops = set(stopwords.words(\"english\"))\n", " meaningful_words = [w for w in words if not w in stops]\n", " return(\" \".join(meaningful_words))\n", "\n", "\n", "def tag_reviews(reviews, prefix):\n", " tagged = []\n", " for i, review in enumerate(reviews):\n", " tagged.append(TaggedDocument(words=review.split(), tags=[prefix + '_%s' % i]))\n", " return tagged" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cleaning and parsing the training set movie reviews...\n", "Cleaning and parsing the test set movie reviews...\n", "Cleaning and parsing the test set movie reviews...\n", "Review 5000 of 50000\n", "\n", "Review 10000 of 50000\n", "\n", "Review 15000 of 50000\n", "\n", "Review 20000 of 50000\n", "\n", "Review 25000 of 50000\n", "\n", "Review 30000 of 50000\n", "\n", "Review 35000 of 50000\n", "\n", "Review 40000 of 50000\n", "\n", "Review 45000 of 50000\n", "\n", "Review 50000 of 50000\n", "\n" ] } ], "source": [ "# gensim modules\n", "from gensim.models import Doc2Vec\n", "\n", "# numpy\n", "import numpy as np\n", "\n", "# classifier\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "# random\n", "from random import shuffle\n", "\n", "# preprocess packages\n", "import pandas as pd\n", "# import sys\n", "# sys.path.insert(0, '..')\n", "# from utils.TextPreprocess import review_to_words, tag_reviews\n", "\n", "\n", "'''\n", "Training Data\n", "'''\n", "train = pd.read_csv(\"../Sentiment/data/labeledTrainData.tsv\", header=0, \n", " delimiter='\\t', quoting=3, error_bad_lines=False)\n", "num_reviews = train[\"review\"].size\n", "\n", "print(\"Cleaning and parsing the training set movie reviews...\")\n", "clean_train_reviews = []\n", "for i in range(0, num_reviews):\n", " clean_train_reviews.append(review_to_words(train[\"review\"][i]))\n", "\n", "'''\n", "Test Data\n", "'''\n", "test = pd.read_csv(\"../Sentiment/data/testData.tsv\", header = 0, delimiter = \"\\t\", quoting = 3)\n", "\n", "num_reviews = len(test[\"review\"])\n", "clean_test_reviews = []\n", "\n", "print(\"Cleaning and parsing the test set movie reviews...\")\n", "for i in range(0, num_reviews):\n", " clean_review = review_to_words(test[\"review\"][i])\n", " clean_test_reviews.append(clean_review)\n", "\n", "\n", "# # Unlabeled Train Data\n", "# unlabeled_reviews = pd.read_csv(\"../Sentiment/data/unlabeledTrainData.tsv\", header = 0, delimiter = \"\\t\", quoting = 3)\n", "# num_reviews = len(unlabeled_reviews[\"review\"])\n", "# clean_unlabeled_reviews = []\n", "\n", "# print(\"Cleaning and parsing the test set movie reviews...\")\n", "# for i in range( 0, num_reviews):\n", "# if( (i+1)%5000 == 0 ):\n", "# print(\"Review %d of %d\\n\" % (i+1, num_reviews))\n", "# clean_review = review_to_words(unlabeled_reviews[\"review\"][i])\n", "# 
{ "cell_type": "markdown", "metadata": {}, "source": [ "Load the trained doc2vec model and obtain the sentence vectors for the train and test sets." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "train_data_features_d2v = []\n", "test_data_features_d2v = []" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "model_dbow = Doc2Vec.load('../Sentiment/src/deep/model/doc2vec_lr100')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hmm, it turns out that reading the vectors out of the model also requires tagged objects such as train_tagged. It is easier to just save the processed vectors directly; that is done in Part 2.9." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "train_data_features_d2v = np.loadtxt('../Sentiment/data/train_feature_d2v.txt')\n", "test_data_features_d2v = np.loadtxt('../Sentiment/data/test_feature_d2v.txt')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from scipy.sparse import bsr_matrix\n", "from sklearn.svm import SVC\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "num_reviews = len(test[\"review\"])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "result = [0.0 for i in range(num_reviews)]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "25000" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(result)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import random\n", "\n", "def sample(train_bow, train_d2v, label):\n", "    # Randomly split the reviews, their doc2vec vectors and their labels into two equal halves.\n", "    num = len(label)\n", "    index_set = set(random.sample(range(num), int(num / 2)))\n", "    \n", "    l1_train_bow = []\n", "    l1_train_d2v = []\n", "    l1_label = []\n", "    \n", "    l2_train_bow = []\n", "    l2_train_d2v = []\n", "    l2_label = []\n", "    \n", "    for i in range(num):\n", "        if i in index_set:\n", "            l1_train_bow.append(train_bow[i])\n", "            l1_train_d2v.append(train_d2v[i])\n", "            l1_label.append(label[i])\n", "        else:\n", "            l2_train_bow.append(train_bow[i])\n", "            l2_train_d2v.append(train_d2v[i])\n", "            l2_label.append(label[i])\n", "    \n", "    return l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "25000\n", "(25000, 100)\n", "25000\n" ] } ], "source": [ "print(len(clean_train_reviews))\n", "print(train_data_features_d2v.shape)\n", "print(len(train[\"sentiment\"].values))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, train[\"sentiment\"].values)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12500\n", "12500\n", "12500\n", "12500\n", "12500\n", "12500\n" ] } ], "source": [ "print(len(l1_train_bow))\n", "print(len(l1_train_d2v))\n", "print(len(l2_train_bow))\n", "print(len(l2_train_d2v))\n", "print(len(l1_label))\n", "print(len(l2_label))" ]
}, { "cell_type": "markdown", "metadata": {}, "source": [ "我想搞清楚这个sample()函数究竟在干什么。\n", "\n", "明白了,`random.sample(range(25000), 12500)`,其实就是从25000个数字里,随机调出12500个。这里又多加了个set,感觉有点多余" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "num = len(train[\"sentiment\"].values) # num = 25000\n", "index_set = set(random.sample(range(num), int(num / 2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "明白了,整个sample函数,其实就是把12500个训练集平均分成了两部分,训练集相关数据有clean_train_reviews(实际的sentence),train_data_features_d2v(经过doc2vec处理的sentence vector)。对应的标签也分成了两部分。" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import heapq \n", "\n", "def select_feature(filePath, k):\n", "\tread = open(filePath, 'r')\n", "\tlab_fea = {}\n", "\t\n", "\tfor line in read:\n", "\t\tline_arr = line.strip().split()\n", "\t\tif len(line_arr) - 1 <= k:\n", "\t\t\tlab_fea[line_arr[0]] = [kv.split(':')[0] for kv in line_arr[1 : ]]\n", "\t\telse:\n", "\t\t\theap = []\n", "\t\t\theapq.heapify(heap)\n", "\t\t\tfor kv in line_arr[1 : ]:\n", "\t\t\t\tkey, val = kv.split(':')\n", "\t\t\t\tif len(heap) < k:\n", "\t\t\t\t\theapq.heappush(heap, (float(val), key))\n", "\t\t\t\telse:\n", "\t\t\t\t\tif float(val) > heap[0][0]:\n", "\t\t\t\t\t\theapq.heappop(heap)\n", "\t\t\t\t\t\theapq.heappush(heap, (float(val), key))\n", "\t\t\tlab_fea[line_arr[0]] = [heapq.heappop(heap)[1] for i in range(len(heap))]\n", "\tread.close()\n", "\treturn lab_fea\n", "\n", "lab_fea = select_feature('feature_chi.txt', 1000)['1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "我们一般处理方式有2种:\n", "1)对数据先fit,再transform,好处是我可以拿到数据变换(比如scaling/幅度变换/标准化)的参数,这样你可以在测试集上也一样做相同的数据变换处理。即先对训练集做fit,然后再对训练集和测试集做transform\n", "2)fit_trainsform,一次性完成数据的变换(比如scaling/幅度变换/标准化),比较快。但是如果在训练集和测试集上用fit_trainsform,可能执行的是两套变换标准(因为训练集和测试集幅度不一样)\n", "\n", "[这个解释](https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models)的也很清楚,transform主要就是为了做中心化之类的预处理操作,让数据更好用一些。" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "training bow ...\n" ] } ], "source": [ "print(\"training bow ...\")\n", "vectorizer_bow = TfidfVectorizer(analyzer = \"word\",\n", " tokenizer = None,\n", " preprocessor = None,\n", " stop_words = None,\n", " vocabulary = lab_fea,\n", " max_features = 19000)\n", "\n", "l1_train_features_bow = vectorizer_bow.fit_transform(l1_train_bow)\n", "l1_train_features_bow = bsr_matrix(l1_train_features_bow)\n", "\n", "l1_lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) \n", "l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)\n", "\n", "l2_test_features_bow = vectorizer_bow.transform(l2_train_bow)\n", "l2_test_features_bow = bsr_matrix(l2_test_features_bow)\n", "\n", "l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:,1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "上面是先训练了一个TfidfVectorizer,对l1_train_bow(即12500个sentence)进行计算得到了l1_train_features_bow(代表每个sentence的特征向量,每个sentence 1000维)。然后用LR对(l1_train_features_bow, l1_label)进行了训练。然后把训练好的模型,对l2_train_bow(l2_test_features_bow)进行了预测。" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(12500, 1000)" ] }, "execution_count": 28, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "l1_train_features_bow.shape" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train doc2vec ...\n" ] } ], "source": [ "print(\"train doc2vec ...\")\n", " \n", "l1_train_features_d2v = bsr_matrix(l1_train_d2v)\n", "l2_test_features_d2v = bsr_matrix(l2_train_d2v)\n", "\n", "l1_svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)\n", "l1_svm_d2v = l1_svm_d2v.fit(l1_train_features_d2v, l1_label)\n", "\n", "l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:,1]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(12500,)" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "l2_result_d2v.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "上面也是,只拿了12500个doc2vec向量,l1_train_d2v,来做训练,分类器是svm,然后对l2_train_d2v进行了预测。" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train ensemble ...\n" ] } ], "source": [ "print(\"train ensemble ...\")\n", " \n", "train_data_features_ens = []\n", "\n", "for i in range(len(l2_result_bow)):\n", " vector = []\n", " vector.append(l2_result_bow[i])\n", " vector.append(l2_result_d2v[i])\n", "\n", " train_data_features_ens.append(vector)\n", "\n", "lr_ens = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "lr_ens = lr_ens.fit(train_data_features_ens, l2_label)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[0.23886548792325121, 0.15337969425958606],\n", " [0.81253080751969953, 0.88316226120104124]]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_data_features_ens[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "这里的一个vector包含两个数字`[l2_result_bow[i], l2_result_d2v[i]]`,所以这里我们得到的train_data_features_ens大概是这样的一个形式`[[l2_result_bow[0], l2_result_d2v[0]], [l2_result_bow[1], l2_result_d2v[1]]]`,写成数字就是上面那样的输出。" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "final predict ...\n" ] } ], "source": [ "print(\"final predict ...\")\n", "train_bow = vectorizer_bow.fit_transform(clean_train_reviews)\n", "train_bow = bsr_matrix(train_bow)\n", "\n", "test_bow = vectorizer_bow.transform(clean_test_reviews)\n", "test_bow = bsr_matrix(test_bow)\n", "\n", "train_d2v = bsr_matrix(train_data_features_d2v)\n", "test_d2v = bsr_matrix(test_data_features_d2v)\n", "\n", "lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "lr_bow = lr_bow.fit(train_bow, list(train[\"sentiment\"]))\n", "\n", "svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)\n", "svm_d2v = svm_d2v.fit(train_d2v, train[\"sentiment\"].values)\n", "\n", "result_bow = lr_bow.predict_proba(test_bow)[:,1]\n", "result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_data_features_ens = []\n", " \n", "for i in 
{ "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "final predict ...\n" ] } ], "source": [ "print(\"final predict ...\")\n", "train_bow = vectorizer_bow.fit_transform(clean_train_reviews)\n", "train_bow = bsr_matrix(train_bow)\n", "\n", "test_bow = vectorizer_bow.transform(clean_test_reviews)\n", "test_bow = bsr_matrix(test_bow)\n", "\n", "train_d2v = bsr_matrix(train_data_features_d2v)\n", "test_d2v = bsr_matrix(test_data_features_d2v)\n", "\n", "lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "lr_bow = lr_bow.fit(train_bow, list(train[\"sentiment\"]))\n", "\n", "svm_d2v = SVC(C = 1.0, kernel='rbf', gamma = 'auto', probability=True)\n", "svm_d2v = svm_d2v.fit(train_d2v, train[\"sentiment\"].values)\n", "\n", "result_bow = lr_bow.predict_proba(test_bow)[:,1]\n", "result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_data_features_ens = []\n", " \n", "for i in range(len(result_bow)):\n", "    vector = []\n", "    vector.append(result_bow[i])\n", "    vector.append(result_d2v[i])\n", "\n", "    test_data_features_ens.append(vector)\n", "\n", "result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:,1]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[0.99794119998088815, 0.99910189748012901],\n", " [0.018351063999300796, 0.0014584329684225311]]" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_data_features_ens[:2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once everything above has finished, one epoch is complete. The results should then be accumulated into result and, at the end, divided by the number of epochs to get the average predicted probability. (Things got a bit hard to follow from cell 31 onward.) Below, the whole pipeline is written out again from start to finish; to speed things up, the SVC is replaced with logistic regression:" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "25000" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = [0.0 for i in range(num_reviews)]\n", "len(result)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "epoch: 0\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 1\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 2\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 3\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n", "epoch: 4\n", "training bow ...\n", "train doc2vec ...\n", "train ensemble ...\n", "final predict ...\n" ] } ], "source": [ "max_iter = 5\n", "for epoch in range(max_iter):\n", "    print(\"epoch: \" + str(epoch))\n", "    l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v, l1_label, l2_label = sample(clean_train_reviews, train_data_features_d2v, train[\"sentiment\"].values)\n", "    \n", "    \n", "    print(\"training bow ...\")\n", "    vectorizer_bow = TfidfVectorizer(analyzer = \"word\",\n", " tokenizer = None,\n", " preprocessor = None,\n", " stop_words = None,\n", " vocabulary = lab_fea,\n", " max_features = 19000)\n", "\n", "    l1_train_features_bow = vectorizer_bow.fit_transform(l1_train_bow)\n", "    l1_train_features_bow = bsr_matrix(l1_train_features_bow)\n", "\n", "    l1_lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) \n", "    l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)\n", "\n", "    l2_test_features_bow = vectorizer_bow.transform(l2_train_bow)\n", "    l2_test_features_bow = bsr_matrix(l2_test_features_bow)\n", "\n", "    l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:,1]\n", "    \n", "    \n", "    print(\"train doc2vec ...\")\n", "    l1_train_features_d2v = bsr_matrix(l1_train_d2v)\n", "    l2_test_features_d2v = bsr_matrix(l2_train_d2v)\n", "\n", "    l1_svm_d2v = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) \n", "    l1_svm_d2v = l1_svm_d2v.fit(l1_train_features_d2v, l1_label)\n", "\n", "    l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:,1]\n", "    \n", "    \n", "    print(\"train ensemble ...\")\n", "    train_data_features_ens = []\n", "\n", "    for i in range(len(l2_result_bow)):\n", "        vector = []\n", "        vector.append(l2_result_bow[i])\n", "        vector.append(l2_result_d2v[i])\n", "\n",
"        train_data_features_ens.append(vector)\n", "\n", "    lr_ens = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "    lr_ens = lr_ens.fit(train_data_features_ens, l2_label)\n", "    \n", "    \n", "    print(\"final predict ...\")\n", "    train_bow = vectorizer_bow.fit_transform(clean_train_reviews)\n", "    train_bow = bsr_matrix(train_bow)\n", "\n", "    test_bow = vectorizer_bow.transform(clean_test_reviews)\n", "    test_bow = bsr_matrix(test_bow)\n", "\n", "    train_d2v = bsr_matrix(train_data_features_d2v)\n", "    test_d2v = bsr_matrix(test_data_features_d2v)\n", "\n", "    lr_bow = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "    lr_bow = lr_bow.fit(train_bow, list(train[\"sentiment\"]))\n", "\n", "    svm_d2v = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", "    svm_d2v = svm_d2v.fit(train_d2v, train[\"sentiment\"].values)\n", "\n", "    result_bow = lr_bow.predict_proba(test_bow)[:,1]\n", "    result_d2v = svm_d2v.predict_proba(test_d2v)[:,1]\n", "    \n", "    test_data_features_ens = []\n", "    \n", "    for i in range(len(result_bow)):\n", "        vector = []\n", "        vector.append(result_bow[i])\n", "        vector.append(result_d2v[i])\n", "\n", "        test_data_features_ens.append(vector)\n", "\n", "    result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:,1]\n", "    \n", "    for i in range(num_reviews):\n", "        result[i] += result_test_ens[i]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Average the results over the 5 epochs." ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for i in range(num_reviews):\n", "    result[i] /= max_iter" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "result = np.array(result)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.97450031, 0.02570378, 0.57902932, ..., 0.05740643,\n", " 0.97011312, 0.65017741])" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ True, False, True, ..., False, True, True], dtype=bool)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result > 0.5" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "result_bool = result >= 0.5" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 1, ..., 0, 1, 1])" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result_bool * 1" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentiment
0\"12311_10\"1
1\"8348_2\"0
2\"5828_4\"1
3\"7186_2\"0
4\"12128_7\"1
\n", "
" ], "text/plain": [ " id sentiment\n", "0 \"12311_10\" 1\n", "1 \"8348_2\" 0\n", "2 \"5828_4\" 1\n", "3 \"7186_2\" 0\n", "4 \"12128_7\" 1" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combine = pd.DataFrame(data={'id': test['id'], \n", " 'sentiment': result_bool * 1})\n", "combine.head()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "output...\n" ] } ], "source": [ "print(\"output...\")\n", "combine.to_csv('../Sentiment/result/ensemble.csv', index=False, quoting=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "最后的结果是0.88968,我不知道作者是怎么得到0.96的,反正这样的结果也只是和combine一样罢了。" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_combine = pd.read_csv('../Sentiment/result/ensemble_final.csv', header=0)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentiment
012311_100.914962
18348_20.063295
25828_40.940739
37186_20.134307
412128_70.924105
\n", "
" ], "text/plain": [ " id sentiment\n", "0 12311_10 0.914962\n", "1 8348_2 0.063295\n", "2 5828_4 0.940739\n", "3 7186_2 0.134307\n", "4 12128_7 0.924105" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_combine.head()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": true }, "outputs": [], "source": [ "test_combine['sentiment'] = test_combine['sentiment'] >= 0.5\n", "test_combine['sentiment'] = test_combine['sentiment'].astype('int')" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsentiment
012311_101
18348_20
25828_41
37186_20
412128_71
\n", "
" ], "text/plain": [ " id sentiment\n", "0 12311_10 1\n", "1 8348_2 0\n", "2 5828_4 1\n", "3 7186_2 0\n", "4 12128_7 1" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_combine.head()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "output...\n" ] } ], "source": [ "print(\"output...\")\n", "test_combine.to_csv('../Sentiment/result/test_combine.csv', index=False, quoting=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "呃……上面是作者原文件里的ensemble_final,我提交后也就0.89的程度……" ] } ], "metadata": { "kernelspec": { "display_name": "Python [py35]", "language": "python", "name": "Python [py35]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }