进行较为复杂的ensemble方法

In [1]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument


_ENGLISH_STOPWORDS = None


def review_to_words(raw_review):
    """Convert a raw (HTML) review into a cleaned, space-joined string of words.

    The review is stripped of HTML markup, every non-letter character is
    replaced by a space, the text is lower-cased and split on whitespace, and
    English stop words are dropped.

    Args:
        raw_review: The raw review text, possibly containing HTML markup.

    Returns:
        The remaining words joined by single spaces.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Build the stop-word set once rather than on every call: this
        # function runs once per review, and the set never changes.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    return " ".join(w for w in words if w not in _ENGLISH_STOPWORDS)


def tag_reviews(reviews, prefix):
    """Wrap each review in a ``TaggedDocument`` tagged ``<prefix>_<index>``.

    Args:
        reviews: Iterable of review strings; each is split on whitespace to
            produce the document's word list.
        prefix: String prepended to the positional index to form the tag.

    Returns:
        A list with one ``TaggedDocument`` per review, in input order.
    """
    return [
        TaggedDocument(words=text.split(), tags=[prefix + '_%s' % idx])
        for idx, text in enumerate(reviews)
    ]

Using TensorFlow backend.


In [2]:
# gensim modules
from gensim.models import Doc2Vec

# numpy
import numpy as np

# classifier
from sklearn.linear_model import LogisticRegression

# random
from random import shuffle

# preprocess packages
import pandas as pd
# import sys
# sys.path.insert(0, '..')
# from utils.TextPreprocess import review_to_words, tag_reviews


# --- Training data ---
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines`; it is kept here to match the environment this was run in.
train = pd.read_csv(
    "../Sentiment/data/labeledTrainData.tsv",
    header=0,
    delimiter='\t',
    quoting=3,
    error_bad_lines=False,
)
num_reviews = train["review"].size

print("Cleaning and parsing the training set movie reviews...")
clean_train_reviews = [
    review_to_words(train["review"][row]) for row in range(0, num_reviews)
]

# --- Test data ---
test = pd.read_csv(
    "../Sentiment/data/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

num_reviews = len(test["review"])

print("Cleaning and parsing the test set movie reviews...")
clean_test_reviews = [
    review_to_words(test["review"][row]) for row in range(0, num_reviews)
]


# # Unlabeled Train Data
# unlabeled_reviews = pd.read_csv("../Sentiment/data/unlabeledTrainData.tsv", header = 0, delimiter = "\t", quoting = 3)
# num_reviews = len(unlabeled_reviews["review"])
# clean_unlabeled_reviews = []

# print("Cleaning and parsing the test set movie reviews...")
# for i in range( 0, num_reviews):
#     if( (i+1)%5000 == 0 ):
#         print("Review %d of %d\n" % (i+1, num_reviews))
#     clean_review = review_to_words(unlabeled_reviews["review"][i])
#     clean_unlabeled_reviews.append(clean_review)

Cleaning and parsing the training set movie reviews...
Cleaning and parsing the test set movie reviews...
Cleaning and parsing the test set movie reviews...
Review 5000 of 50000

Review 10000 of 50000

Review 15000 of 50000

Review 20000 of 50000

Review 25000 of 50000

Review 30000 of 50000

Review 35000 of 50000

Review 40000 of 50000

Review 45000 of 50000

Review 50000 of 50000



把训练好的doc2vec模型导入，得到train和test的sentence vector

In [3]:
# Placeholders for the Doc2Vec feature matrices; both names are rebound by
# the np.loadtxt cell further down.
train_data_features_d2v = []
test_data_features_d2v = []

In [4]:
# Load a previously saved Doc2Vec model from disk.
model_dbow = Doc2Vec.load('../Sentiment/src/deep/model/doc2vec_lr100')

呃，发现还需要train_tagged这样有tag信息的对象才能读取。我还是直接把处理好的vector保存好得了。在Part 2.9进行保存

In [5]:
# Load the precomputed Doc2Vec feature matrices for the train and test
# reviews from whitespace-delimited text files.
train_data_features_d2v = np.loadtxt('../Sentiment/data/train_feature_d2v.txt')
test_data_features_d2v = np.loadtxt('../Sentiment/data/test_feature_d2v.txt')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import bsr_matrix
from sklearn.svm import SVC


In [10]:
# Number of test reviews; used to size the prediction accumulator `result`.
num_reviews = len(test["review"])

In [11]:
# One running probability total per test review, starting at zero.
result = [0.0] * num_reviews

In [12]:
len(result)

25000

In [13]:
import random

def sample(train_bow, train_d2v, label):
    """Randomly split three parallel sequences into two disjoint parts.

    ``int(len(label) / 2)`` distinct indices are drawn at random; the items
    at those indices form the first part and all remaining items form the
    second. Within each part the original relative order is preserved, and
    the same indices are used for all three inputs so they stay aligned.

    Args:
        train_bow: Indexable sequence (e.g. cleaned review strings).
        train_d2v: Indexable sequence aligned with ``train_bow``.
        label: Indexable sequence of labels; its length defines the sample
            count.

    Returns:
        A tuple of six lists:
        ``(l1_train_bow, l1_train_d2v, l2_train_bow, l2_train_d2v,
        l1_label, l2_label)``.
    """
    total = len(label)
    # A set makes the membership tests below O(1).
    chosen = set(random.sample(range(total), int(total / 2)))

    first_idx = [pos for pos in range(total) if pos in chosen]
    second_idx = [pos for pos in range(total) if pos not in chosen]

    def pick(seq, positions):
        return [seq[pos] for pos in positions]

    return (
        pick(train_bow, first_idx),
        pick(train_d2v, first_idx),
        pick(train_bow, second_idx),
        pick(train_d2v, second_idx),
        pick(label, first_idx),
        pick(label, second_idx),
    )


In [20]:
# Sanity check: the three inputs passed to sample() should have the same
# number of rows.
print(len(clean_train_reviews))
print(train_data_features_d2v.shape)
print(len(train["sentiment"].values))

25000
(25000, 100)
25000


In [14]:
# Split the cleaned reviews, their Doc2Vec vectors and the labels into two
# random, aligned parts.
(l1_train_bow, l1_train_d2v,
 l2_train_bow, l2_train_d2v,
 l1_label, l2_label) = sample(
    clean_train_reviews,
    train_data_features_d2v,
    train["sentiment"].values,
)

In [18]:
# Print the size of each of the six pieces returned by sample(), one per line.
print(
    *map(len, (l1_train_bow, l1_train_d2v,
               l2_train_bow, l2_train_d2v,
               l1_label, l2_label)),
    sep="\n"
)

12500
12500
12500
12500
12500
12500


我想搞清楚这个sample()函数究竟在干什么。

明白了，`random.sample(range(25000), 12500)`，其实就是从25000个数字里，随机挑出12500个（不重复）。这里又多加了个set，乍一看有点多余，但其实是为了让后面循环里的 `i in index_set` 判断是 O(1) 的查找；如果用list，每次判断都是 O(n)。

In [22]:
# Reproduce the first two lines of sample() to inspect them in isolation:
# draw half of the row indices at random, without repetition.
num = len(train["sentiment"].values) # num = 25000
index_set = set(random.sample(range(num), int(num / 2)))

明白了，整个sample函数，其实就是把25000个训练样本随机平均分成了两部分（各12500个），训练集相关数据有clean_train_reviews（实际的sentence），train_data_features_d2v（经过doc2vec处理的sentence vector）。对应的标签也分成了两部分。

In [26]:
import heapq 

def select_feature(filePath, k):
    """Select up to ``k`` top-scoring feature names per label from a score file.

    Each non-blank line of the file is whitespace-separated and has the form
    ``<label> <feature>:<score> <feature>:<score> ...``. Blank lines are
    skipped.

    Args:
        filePath: Path of the score file to read.
        k: Maximum number of features to keep for each label. A value of
            zero or less keeps no features.

    Returns:
        A dict mapping each label (the first token of a line) to a list of
        feature names. If a line has at most ``k`` features they are all
        kept, in file order; otherwise the ``k`` highest-scoring features are
        returned in ascending score order. If a label occurs on several
        lines, the last line wins.

    Raises:
        ValueError: If a ``feature:score`` token on a line with more than
            ``k`` features does not split into exactly two parts on ``:``,
            or its score is not a valid float.
    """
    lab_fea = {}

    # `with` guarantees the file is closed even if a line fails to parse.
    with open(filePath, 'r') as read:
        for line in read:
            line_arr = line.strip().split()
            if not line_arr:
                # Blank line: nothing to record.
                continue

            label, pairs = line_arr[0], line_arr[1:]

            if k <= 0:
                # No features requested; avoid peeking at an empty heap.
                lab_fea[label] = []
                continue

            if len(pairs) <= k:
                lab_fea[label] = [kv.split(':')[0] for kv in pairs]
                continue

            # Min-heap holding the k best (score, name) pairs seen so far;
            # heap[0] is always the weakest of them.
            heap = []
            for kv in pairs:
                key, val = kv.split(':')
                score = float(val)
                if len(heap) < k:
                    heapq.heappush(heap, (score, key))
                elif score > heap[0][0]:
                    # Strictly better than the weakest kept entry: swap it.
                    heapq.heapreplace(heap, (score, key))

            lab_fea[label] = [key for _, key in sorted(heap)]

    return lab_fea

# Keep at most 1000 selected feature names for label '1'; this list is later
# passed to TfidfVectorizer as its fixed vocabulary.
lab_fea = select_feature('feature_chi.txt', 1000)['1']

我们一般处理方式有2种：
1）对数据先fit，再transform，好处是我可以拿到数据变换(比如scaling/幅度变换/标准化)的参数，这样你可以在测试集上也一样做相同的数据变换处理。即先对训练集做fit，然后再对训练集和测试集做transform
2）fit_transform，一次性完成数据的变换(比如scaling/幅度变换/标准化)，比较快。但是如果在训练集和测试集上都用fit_transform，可能执行的是两套变换标准(因为训练集和测试集幅度不一样)

[这个解释](https://datascience.stackexchange.com/questions/12321/difference-between-fit-and-fit-transform-in-scikit-learn-models)的也很清楚，transform主要就是为了做中心化之类的预处理操作，让数据更好用一些。

In [27]:
print("training bow ...")
# `vocabulary` pins the TF-IDF columns to the pre-selected features, so
# scikit-learn ignores `max_features` here.
vectorizer_bow = TfidfVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 vocabulary=lab_fea,
                                 max_features=19000)

# Learn the IDF weights on the first part only, then vectorize it.
l1_train_features_bow = vectorizer_bow.fit_transform(l1_train_bow)
l1_train_features_bow = bsr_matrix(l1_train_features_bow)

# solver='liblinear' is given explicitly: the L1 penalty is not supported by
# 'lbfgs', which became the default solver in scikit-learn 0.22. On older
# versions liblinear was already the default, so results are unchanged there.
l1_lr_bow = LogisticRegression(penalty='l1', solver='liblinear', dual=False,
                               tol=0.0001, C=1, fit_intercept=True,
                               intercept_scaling=1.0, class_weight=None,
                               random_state=None)
l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)

# Vectorize the held-out second part with the weights learned above.
l2_test_features_bow = vectorizer_bow.transform(l2_train_bow)
l2_test_features_bow = bsr_matrix(l2_test_features_bow)

# Column 1 of predict_proba is the probability of the second class
# (label 1 when the labels are 0/1).
l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:, 1]

training bow ...


上面是先训练了一个TfidfVectorizer，对l1_train_bow（即12500个sentence）进行计算得到了l1_train_features_bow（代表每个sentence的特征向量，每个sentence 1000维）。然后用LR对(l1_train_features_bow, l1_label)进行了训练。然后把训练好的模型，对l2_train_bow（l2_test_features_bow）进行了预测。

In [28]:
l1_train_features_bow.shape

(12500, 1000)

In [31]:
print("train doc2vec ...")

# Doc2Vec vectors of each part, wrapped as sparse matrices.
l1_train_features_d2v, l2_test_features_d2v = (
    bsr_matrix(l1_train_d2v),
    bsr_matrix(l2_train_d2v),
)

# RBF-kernel SVM fit on the first part; probability=True is what makes
# predict_proba available.
l1_svm_d2v = SVC(C=1.0, kernel='rbf', gamma='auto', probability=True).fit(
    l1_train_features_d2v, l1_label
)

# Probability of the second class for every held-out review.
l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:, 1]

train doc2vec ...


In [32]:
l2_result_d2v.shape

(12500,)

上面也是，只拿了12500个doc2vec向量，l1_train_d2v，来做训练，分类器是svm，然后对l2_train_d2v进行了预测。

In [60]:
print("train ensemble ...")

# One meta-feature row per held-out review:
# [BOW model probability, Doc2Vec model probability].
train_data_features_ens = [
    [bow_prob, l2_result_d2v[row]]
    for row, bow_prob in enumerate(l2_result_bow)
]

# Second-level classifier fit on the held-out predictions and their labels.
lr_ens = LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1,
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight=None,
    random_state=None,
).fit(train_data_features_ens, l2_label)

train ensemble ...


In [61]:
train_data_features_ens[:2]

[[0.23886548792325121, 0.15337969425958606],
 [0.81253080751969953, 0.88316226120104124]]

这里的一个vector包含两个数字`[l2_result_bow[i], l2_result_d2v[i]]`，所以这里我们得到的train_data_features_ens大概是这样的一个形式`[[l2_result_bow[0], l2_result_d2v[0]], [l2_result_bow[1], l2_result_d2v[1]]]`，写成数字就是上面那样的输出。

In [35]:
print("final predict ...")
# Refit the TF-IDF weights on the complete training set, then project the
# test reviews with those same weights.
train_bow = vectorizer_bow.fit_transform(clean_train_reviews)
train_bow = bsr_matrix(train_bow)

test_bow = vectorizer_bow.transform(clean_test_reviews)
test_bow = bsr_matrix(test_bow)

train_d2v = bsr_matrix(train_data_features_d2v)
test_d2v = bsr_matrix(test_data_features_d2v)

# solver='liblinear' is given explicitly: the L1 penalty is not supported by
# 'lbfgs', which became the default solver in scikit-learn 0.22. On older
# versions liblinear was already the default, so results are unchanged there.
lr_bow = LogisticRegression(penalty='l1', solver='liblinear', dual=False,
                            tol=0.0001, C=1, fit_intercept=True,
                            intercept_scaling=1.0, class_weight=None,
                            random_state=None)
lr_bow = lr_bow.fit(train_bow, list(train["sentiment"]))

# probability=True is what makes predict_proba available on the SVM.
svm_d2v = SVC(C=1.0, kernel='rbf', gamma='auto', probability=True)
svm_d2v = svm_d2v.fit(train_d2v, train["sentiment"].values)

# Probability of the second class for every test review, from each model.
result_bow = lr_bow.predict_proba(test_bow)[:, 1]
result_d2v = svm_d2v.predict_proba(test_d2v)[:, 1]

final predict ...


In [None]:
# One meta-feature row per test review:
# [BOW model probability, Doc2Vec model probability].
# (The original loop used `xrange`, which does not exist in Python 3.)
test_data_features_ens = [
    [bow_prob, d2v_prob] for bow_prob, d2v_prob in zip(result_bow, result_d2v)
]

# Second-level probability of the second class for every test review.
result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:, 1]

In [62]:
test_data_features_ens[:2]

[[0.99794119998088815, 0.99910189748012901],
 [0.018351063999300796, 0.0014584329684225311]]

上面所有都结束后，就算是一次epoch结束了。之后应该把结果都加到result里，然后除以epoch次数，得到平均预测概率。感觉从31开始就有点看不懂了。下面把所有的内容都完整写一遍，为了加快速度，把svc变为lr：

In [38]:
# Reset the accumulator: one running probability total per test review.
result = [0.0] * num_reviews
len(result)

25000

In [41]:
max_iter = 5

# --- Epoch-independent work --------------------------------------------------
# The level-1 models used at prediction time are fit on the *full* training
# set and never see the per-epoch split, so they (and the test-set meta
# features derived from them) are computed once here instead of every epoch.
print("training full-data models ...")
# `vocabulary` pins the TF-IDF columns, so `max_features` is ignored.
full_vectorizer_bow = TfidfVectorizer(analyzer="word",
                                      tokenizer=None,
                                      preprocessor=None,
                                      stop_words=None,
                                      vocabulary=lab_fea,
                                      max_features=19000)
train_bow = bsr_matrix(full_vectorizer_bow.fit_transform(clean_train_reviews))
test_bow = bsr_matrix(full_vectorizer_bow.transform(clean_test_reviews))

train_d2v = bsr_matrix(train_data_features_d2v)
test_d2v = bsr_matrix(test_data_features_d2v)

# solver='liblinear' is explicit wherever penalty='l1' is used: L1 is not
# supported by 'lbfgs', the default solver since scikit-learn 0.22. Older
# versions already defaulted to liblinear, so results are unchanged there.
lr_bow = LogisticRegression(penalty='l1', solver='liblinear', dual=False,
                            tol=0.0001, C=1, fit_intercept=True,
                            intercept_scaling=1.0, class_weight=None,
                            random_state=None)
lr_bow = lr_bow.fit(train_bow, list(train["sentiment"]))

# Named svm_* for continuity with the earlier cells, but this is a logistic
# regression (swapped in for speed).
# NOTE(review): this full-data Doc2Vec model uses penalty='l2' while the
# per-epoch one below uses penalty='l1' — confirm which was intended.
svm_d2v = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1,
                             fit_intercept=True, intercept_scaling=1.0,
                             class_weight=None, random_state=None)
svm_d2v = svm_d2v.fit(train_d2v, train["sentiment"].values)

result_bow = lr_bow.predict_proba(test_bow)[:, 1]
result_d2v = svm_d2v.predict_proba(test_d2v)[:, 1]

# One meta-feature row per test review: [BOW probability, Doc2Vec probability].
test_data_features_ens = [
    [bow_prob, d2v_prob] for bow_prob, d2v_prob in zip(result_bow, result_d2v)
]

# --- Per-epoch work ----------------------------------------------------------
for epoch in range(max_iter):
    print("epoch: " + str(epoch))
    # Fresh random split: level-1 models train on part 1, and their
    # predictions on part 2 become the training data of the ensemble.
    (l1_train_bow, l1_train_d2v,
     l2_train_bow, l2_train_d2v,
     l1_label, l2_label) = sample(clean_train_reviews,
                                  train_data_features_d2v,
                                  train["sentiment"].values)

    print("training bow ...")
    vectorizer_bow = TfidfVectorizer(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     stop_words=None,
                                     vocabulary=lab_fea,
                                     max_features=19000)

    l1_train_features_bow = bsr_matrix(vectorizer_bow.fit_transform(l1_train_bow))

    l1_lr_bow = LogisticRegression(penalty='l1', solver='liblinear', dual=False,
                                   tol=0.0001, C=1, fit_intercept=True,
                                   intercept_scaling=1.0, class_weight=None,
                                   random_state=None)
    l1_lr_bow = l1_lr_bow.fit(l1_train_features_bow, l1_label)

    l2_test_features_bow = bsr_matrix(vectorizer_bow.transform(l2_train_bow))
    l2_result_bow = l1_lr_bow.predict_proba(l2_test_features_bow)[:, 1]

    print("train doc2vec ...")
    l1_train_features_d2v = bsr_matrix(l1_train_d2v)
    l2_test_features_d2v = bsr_matrix(l2_train_d2v)

    l1_svm_d2v = LogisticRegression(penalty='l1', solver='liblinear', dual=False,
                                    tol=0.0001, C=1, fit_intercept=True,
                                    intercept_scaling=1.0, class_weight=None,
                                    random_state=None)
    l1_svm_d2v = l1_svm_d2v.fit(l1_train_features_d2v, l1_label)

    l2_result_d2v = l1_svm_d2v.predict_proba(l2_test_features_d2v)[:, 1]

    print("train ensemble ...")
    train_data_features_ens = [
        [bow_prob, d2v_prob]
        for bow_prob, d2v_prob in zip(l2_result_bow, l2_result_d2v)
    ]

    lr_ens = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1,
                                fit_intercept=True, intercept_scaling=1.0,
                                class_weight=None, random_state=None)
    lr_ens = lr_ens.fit(train_data_features_ens, l2_label)

    print("final predict ...")
    result_test_ens = lr_ens.predict_proba(test_data_features_ens)[:, 1]

    # Accumulate this epoch's probabilities; they are averaged afterwards.
    for i in range(num_reviews):
        result[i] += result_test_ens[i]

epoch: 0
training bow ...
train doc2vec ...
train ensemble ...
final predict ...
epoch: 1
training bow ...
train doc2vec ...
train ensemble ...
final predict ...
epoch: 2
training bow ...
train doc2vec ...
train ensemble ...
final predict ...
epoch: 3
training bow ...
train doc2vec ...
train ensemble ...
final predict ...
epoch: 4
training bow ...
train doc2vec ...
train ensemble ...
final predict ...


对5次的结果取平均

In [42]:
# Turn the per-review totals accumulated over the epochs into means.
# Running this more than once divides again, so execute it a single time.
result[:] = [total / max_iter for total in result]

In [45]:
# Convert the list of averaged probabilities to a NumPy array so it can be
# thresholded element-wise.
result = np.array(result)

In [46]:
result

array([ 0.97450031,  0.02570378,  0.57902932, ...,  0.05740643,
        0.97011312,  0.65017741])

In [47]:
result > 0.5

array([ True, False,  True, ..., False,  True,  True], dtype=bool)

In [48]:
# Boolean mask: True where the averaged probability is at least 0.5.
result_bool = result >= 0.5

In [50]:
result_bool * 1

array([1, 0, 1, ..., 0, 1, 1])

In [51]:
# Build the submission table: test ids alongside 0/1 predictions
# (multiplying the boolean mask by 1 yields integers).
combine = pd.DataFrame(data={'id': test['id'], 
                             'sentiment': result_bool * 1})
combine.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1


In [52]:
print("output...")
# quoting=3 is csv.QUOTE_NONE: nothing is quoted on output. The ids already
# carry literal quote characters because the test file was read with
# quoting=3 as well.
combine.to_csv('../Sentiment/result/ensemble.csv', index=False, quoting=3)

output...


最后的结果是0.88968，我不知道作者是怎么得到0.96的，反正这样的结果也只是和combine一样罢了。

In [53]:
# Load an existing result file whose `sentiment` column holds probabilities
# rather than 0/1 labels.
test_combine = pd.read_csv('../Sentiment/result/ensemble_final.csv', header=0)

In [54]:
test_combine.head()

Unnamed: 0,id,sentiment
0,12311_10,0.914962
1,8348_2,0.063295
2,5828_4,0.940739
3,7186_2,0.134307
4,12128_7,0.924105


In [55]:
# Threshold the probabilities at 0.5 and store them as integer 0/1 labels.
test_combine['sentiment'] = (test_combine['sentiment'] >= 0.5).astype('int')

In [57]:
test_combine.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [56]:
print("output...")
# Write the thresholded labels; quoting=3 is csv.QUOTE_NONE (no quoting).
test_combine.to_csv('../Sentiment/result/test_combine.csv', index=False, quoting=3)

output...


呃……上面是作者原文件里的ensemble_final，我提交后也就0.89的程度……