{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "把bow模型的输出结果变为概率" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import re\n", "from bs4 import BeautifulSoup\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from gensim.models.doc2vec import TaggedDocument\n", "\n", "\n", "def review_to_words(raw_review):\n", " review_text = BeautifulSoup(raw_review, 'lxml').get_text()\n", " letters_only = re.sub(\"[^a-zA-Z]\", \" \", review_text) \n", " words = letters_only.lower().split()\n", " stops = set(stopwords.words(\"english\"))\n", " meaningful_words = [w for w in words if not w in stops]\n", " return(\" \".join(meaningful_words))\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cleaning and parsing the training set movie reviews...\n", "Cleaning and parsing the test set movie reviews...\n" ] } ], "source": [ "# numpy\n", "import numpy as np\n", "\n", "# classifier\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "# random\n", "from random import shuffle\n", "\n", "# preprocess packages\n", "import pandas as pd\n", "\n", "\n", "'''\n", "Training Data\n", "'''\n", "train = pd.read_csv(\"../Sentiment/data/labeledTrainData.tsv\", header=0, \n", " delimiter='\\t', quoting=3, error_bad_lines=False)\n", "num_reviews = train[\"review\"].size\n", "\n", "print(\"Cleaning and parsing the training set movie reviews...\")\n", "clean_train_reviews = []\n", "for i in range(0, num_reviews):\n", " clean_train_reviews.append(review_to_words(train[\"review\"][i]))\n", "\n", "'''\n", "Test Data\n", "'''\n", "test = pd.read_csv(\"../Sentiment/data/testData.tsv\", header = 0, delimiter = \"\\t\", quoting = 3)\n", "\n", "num_reviews = len(test[\"review\"])\n", "clean_test_reviews = []\n", "\n", "print(\"Cleaning and parsing the test set movie reviews...\")\n", "for i in range(0, num_reviews):\n", " clean_review = review_to_words(test[\"review\"][i])\n", " clean_test_reviews.append(clean_review)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "构建class对象" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import heapq \n", "\n", "def select_feature(filePath, k):\n", "\tread = open(filePath, 'r')\n", "\tlab_fea = {}\n", "\t\n", "\tfor line in read:\n", "\t\tline_arr = line.strip().split()\n", "\t\tif len(line_arr) - 1 <= k:\n", "\t\t\tlab_fea[line_arr[0]] = [kv.split(':')[0] for kv in line_arr[1 : ]]\n", "\t\telse:\n", "\t\t\theap = []\n", "\t\t\theapq.heapify(heap)\n", "\t\t\tfor kv in line_arr[1 : ]:\n", "\t\t\t\tkey, val = kv.split(':')\n", "\t\t\t\tif len(heap) < k:\n", "\t\t\t\t\theapq.heappush(heap, (float(val), key))\n", "\t\t\t\telse:\n", "\t\t\t\t\tif float(val) > heap[0][0]:\n", "\t\t\t\t\t\theapq.heappop(heap)\n", "\t\t\t\t\t\theapq.heappush(heap, (float(val), key))\n", "\t\t\tlab_fea[line_arr[0]] = [heapq.heappop(heap)[1] for i in range(len(heap))]\n", "\tread.close()\n", "\treturn lab_fea" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from utils.feature_select import select_feature\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.linear_model import LogisticRegression\n", "from scipy.sparse import bsr_matrix\n", "import numpy as np\n", "\n", "class BagOfWords(object):\n", " \n", " def __init__(self, vocab = False, tfidf = False, max_feature = 1000):\n", " lab_fea = None\n", " if(vocab == True):\n", " print(\"select features...\")\n", " lab_fea = select_feature('../Sentiment/data/feature_chi.txt', max_feature)[\"1\"]\n", " \n", " self.vectorizer = None\n", " if(tfidf == True):\n", " self.vectorizer = TfidfVectorizer(analyzer = \"word\",\n", " tokenizer = None,\n", " preprocessor = None,\n", " stop_words = None,\n", " vocabulary = lab_fea,\n", " max_features = max_feature)\n", " else:\n", " self.vectorizer = CountVectorizer(analyzer = \"word\",\n", " tokenizer = None,\n", " preprocessor = None,\n", " stop_words = None,\n", " vocabulary = lab_fea,\n", " max_features = max_feature)\n", " self.lr = None\n", " \n", " def train_lr(self, train_data, lab_data, C = 1.0):\n", " train_data_features = self.vectorizer.fit_transform(train_data)\n", " train_data_features = bsr_matrix(train_data_features)\n", " print (train_data_features.shape)\n", " \n", " print(\"Training the logistic regression...\")\n", " self.lr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=C, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) \n", " self.lr = self.lr.fit(train_data_features, lab_data)\n", " \n", " def test_lr(self, test_data):\n", " test_data_features = self.vectorizer.transform(test_data)\n", " test_data_features = bsr_matrix(test_data_features)\n", " \n", " result = self.lr.predict_proba(test_data_features)[:,1]\n", " return result\n", " \n", " def validate_lr(self, train_data, lab_data, C = 1.0):\n", " train_data_features = self.vectorizer.fit_transform(train_data)\n", " train_data_features = bsr_matrix(train_data_features)\n", " lab_data = np.array(lab_data)\n", " \n", " print(\"start k-fold validate...\")\n", " lr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=C, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None)\n", " cv = np.mean(cross_val_score(lr, train_data_features, lab_data, cv=10, scoring='roc_auc'))\n", " return cv\n", " " ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "select features...\n", "(25000, 19000)\n", "Training the logistic regression...\n", "[ 0.95791519 0.06738943 0.64317872 ..., 0.33803325 0.95515132\n", " 0.6234163 ]\n" ] } ], "source": [ "bow = BagOfWords(vocab = True, tfidf = True, max_feature = 19000)\n", "bow.train_lr(clean_train_reviews, list(train[\"sentiment\"]), C = 1)\n", "result = bow.test_lr(clean_test_reviews)\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "output...\n" ] } ], "source": [ "print(\"output...\")\n", "output_dbow_prob = pd.DataFrame(data={'id': test['id'], 'sentiment': result})\n", "output_dbow_prob.to_csv('../Sentiment/result/bow_lr_prob.csv', index=False, quoting=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [py35]", "language": "python", "name": "Python [py35]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }