把bow模型的输出结果变为概率

In [2]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument


def review_to_words(raw_review):
    """Turn one raw review into a cleaned, space-separated string of words.

    Processing steps, in order:
      1. strip markup with BeautifulSoup (lxml parser) and keep the text;
      2. replace every character that is not an ASCII letter with a space;
      3. lower-case and split on whitespace;
      4. drop NLTK English stop words;
      5. join the remaining words with single spaces.

    Args:
        raw_review: review text, possibly containing HTML markup.

    Returns:
        str: the remaining words joined by single spaces (empty string if
        nothing survives the filtering).
    """
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()

    # The stop-word list is the same for every review, so load it and build
    # the lookup set only once and keep it on the function object instead of
    # repeating that work for each of the tens of thousands of calls.
    stops = getattr(review_to_words, "_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        review_to_words._stops = stops

    return " ".join(w for w in words if w not in stops)


Using TensorFlow backend.


In [3]:
# numpy
import numpy as np

# classifier
from sklearn.linear_model import LogisticRegression

# random
from random import shuffle

# preprocess packages
import pandas as pd


# ---------------------------------------------------------------------------
# Training data
# ---------------------------------------------------------------------------
# quoting=3 is csv.QUOTE_NONE: quote characters in the reviews are kept as-is.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 (replacement: on_bad_lines="skip") -- confirm the pandas version this
# notebook targets before changing it.
train = pd.read_csv(
    "../Sentiment/data/labeledTrainData.tsv",
    header=0,
    delimiter='\t',
    quoting=3,
    error_bad_lines=False,
)
num_reviews = train["review"].size

print("Cleaning and parsing the training set movie reviews...")
clean_train_reviews = [review_to_words(raw_text) for raw_text in train["review"]]

# ---------------------------------------------------------------------------
# Test data
# ---------------------------------------------------------------------------
test = pd.read_csv(
    "../Sentiment/data/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
num_reviews = len(test["review"])

print("Cleaning and parsing the test set movie reviews...")
clean_test_reviews = [review_to_words(raw_text) for raw_text in test["review"]]

Cleaning and parsing the training set movie reviews...
Cleaning and parsing the test set movie reviews...


构建class对象

In [7]:
import heapq 

def select_feature(filePath, k):
    """Read a per-label feature-score file and keep at most k features per label.

    Each non-blank line is expected to look like
    ``<label> <feature>:<score> <feature>:<score> ...`` (whitespace separated).

    For a label with k or fewer features, the feature names are returned in
    file order and the scores are not parsed. Otherwise the k features with
    the highest scores are returned, ordered from lowest to highest
    ``(score, name)``. A later feature only displaces the current minimum
    when its score is strictly greater, so on ties the earlier one is kept.

    Args:
        filePath: path of the feature-score text file.
        k: maximum number of features to keep per label; ``k <= 0`` yields an
            empty list for every label.

    Returns:
        dict: label string -> list of selected feature names.

    Raises:
        ValueError: if a scored entry is not exactly ``name:score`` or the
            score is not a valid float (only on lines with more than k features).
    """
    lab_fea = {}
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(filePath, 'r') as read:
        for line in read:
            line_arr = line.strip().split()
            if not line_arr:
                # Blank line (e.g. trailing newline at end of file): nothing to record.
                continue

            label, pairs = line_arr[0], line_arr[1:]
            if k <= 0:
                # Nothing can be selected; avoids indexing an empty heap below.
                lab_fea[label] = []
            elif len(pairs) <= k:
                lab_fea[label] = [kv.split(':')[0] for kv in pairs]
            else:
                # Min-heap holding the k best (score, name) pairs seen so far.
                heap = []
                for kv in pairs:
                    key, val = kv.split(':')
                    score = float(val)
                    if len(heap) < k:
                        heapq.heappush(heap, (score, key))
                    elif score > heap[0][0]:
                        # Drop the current minimum and insert the new pair in one step.
                        heapq.heapreplace(heap, (score, key))
                lab_fea[label] = [heapq.heappop(heap)[1] for _ in range(len(heap))]
    return lab_fea

In [8]:
from utils.feature_select import select_feature
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import bsr_matrix
import numpy as np

class BagOfWords(object):
    """Bag-of-words (raw counts or TF-IDF) features fed to a logistic regression.

    Attributes:
        vectorizer: the ``CountVectorizer`` or ``TfidfVectorizer`` used to turn
            documents into feature rows.
        lr: the fitted ``LogisticRegression`` after ``train_lr``; ``None`` until then.
    """

    def __init__(self, vocab=False, tfidf=False, max_feature=1000,
                 feature_path='../Sentiment/data/feature_chi.txt', label="1"):
        """Build the vectorizer, optionally restricted to pre-selected features.

        Args:
            vocab: when truthy, load a fixed vocabulary with ``select_feature``
                and pass it to the vectorizer.
            tfidf: when truthy use ``TfidfVectorizer``, otherwise ``CountVectorizer``.
            max_feature: number of features to select from the feature file and
                the ``max_features`` value handed to the vectorizer.
            feature_path: feature-score file read when ``vocab`` is set.
            label: key of the ``select_feature`` result whose feature list is
                used as the vocabulary.
        """
        lab_fea = None
        if vocab:
            print("select features...")
            lab_fea = select_feature(feature_path, max_feature)[label]

        # Both vectorizers take the same arguments, so pick the class once.
        vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
        self.vectorizer = vectorizer_cls(analyzer="word",
                                         tokenizer=None,
                                         preprocessor=None,
                                         stop_words=None,
                                         vocabulary=lab_fea,
                                         max_features=max_feature)
        self.lr = None

    @staticmethod
    def _new_lr(C):
        """Return an unfitted L2-penalised LogisticRegression with inverse strength C."""
        return LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=C,
                                  fit_intercept=True, intercept_scaling=1.0,
                                  class_weight=None, random_state=None)

    def _fit_features(self, train_data):
        """Fit the vectorizer on train_data and return its features as a BSR matrix."""
        return bsr_matrix(self.vectorizer.fit_transform(train_data))

    def train_lr(self, train_data, lab_data, C=1.0):
        """Fit the vectorizer and the logistic regression on the training set.

        Prints the feature-matrix shape and a progress message; stores the
        fitted model in ``self.lr``.

        Args:
            train_data: iterable of documents passed to the vectorizer.
            lab_data: labels passed to ``LogisticRegression.fit``.
            C: inverse regularisation strength of the logistic regression.
        """
        train_data_features = self._fit_features(train_data)
        print(train_data_features.shape)

        print("Training the logistic regression...")
        self.lr = self._new_lr(C).fit(train_data_features, lab_data)

    def test_lr(self, test_data):
        """Return column 1 of ``predict_proba`` for every document in test_data.

        Column 1 is the second entry of ``self.lr.classes_`` (the positive
        class when the training labels are 0/1).

        Args:
            test_data: iterable of documents, transformed with the already
                fitted vectorizer.

        Returns:
            1-D array with one probability per document.

        Raises:
            sklearn.exceptions.NotFittedError: if ``train_lr`` has not been
                called. It subclasses both ``AttributeError`` and ``ValueError``,
                so handlers written for the previous AttributeError still match.
        """
        if self.lr is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "BagOfWords.test_lr called before train_lr: no model has been fitted")

        test_data_features = bsr_matrix(self.vectorizer.transform(test_data))
        return self.lr.predict_proba(test_data_features)[:, 1]

    def validate_lr(self, train_data, lab_data, C=1.0):
        """Return the mean 10-fold cross-validated ROC AUC on the training set.

        Re-fits ``self.vectorizer`` on train_data as a side effect; ``self.lr``
        is not touched (a fresh estimator is cross-validated).

        Args:
            train_data: iterable of documents passed to the vectorizer.
            lab_data: labels; converted to a NumPy array before scoring.
            C: inverse regularisation strength of the logistic regression.

        Returns:
            Mean of the ten per-fold ``roc_auc`` scores.
        """
        train_data_features = self._fit_features(train_data)
        lab_data = np.array(lab_data)

        print("start k-fold validate...")
        scores = cross_val_score(self._new_lr(C), train_data_features, lab_data,
                                 cv=10, scoring='roc_auc')
        return np.mean(scores)
 

In [9]:
# TF-IDF bag-of-words restricted to the 19000 pre-selected features.
bow = BagOfWords(max_feature=19000, tfidf=True, vocab=True)

# Fit on the cleaned training reviews, then score the cleaned test reviews.
bow.train_lr(train_data=clean_train_reviews,
             lab_data=list(train["sentiment"]),
             C=1)
result = bow.test_lr(test_data=clean_test_reviews)
print(result)

select features...
(25000, 19000)
Training the logistic regression...
[ 0.95791519 0.06738943 0.64317872 ..., 0.33803325 0.95515132
 0.6234163 ]


In [10]:
print("output...")
# One row per test review: its id and the predicted probability.
output_dbow_prob = pd.DataFrame({'id': test['id'], 'sentiment': result})
# quoting=3 is csv.QUOTE_NONE, i.e. fields are written without quote characters.
output_dbow_prob.to_csv(
    path_or_buf='../Sentiment/result/bow_lr_prob.csv',
    quoting=3,
    index=False,
)

output...
