# coding=utf-8
import pandas as pd
import re
from skTrain import cut2wd


class split():
    """Split the labelled training CSV into per-sentiment plain-text files.

    On construction, ``train&test/train.csv`` is loaded and the text column
    ``微博中文内容`` is bucketed by the label column ``情感倾向``:

    * ``self.pos`` -- texts whose label is ``1``
    * ``self.neg`` -- texts whose label is ``-1``
    * ``self.mid`` -- texts whose label is ``0``

    ``startSplit`` then writes cleaned prefixes of those buckets, plus the
    cleaned test set, as UTF-8 text files with one comment per line.
    """

    # Noise patterns stripped by cleanComment, applied in exactly this order.
    # Every ``.*`` is greedy and does not cross newlines, so each pattern
    # removes everything from its first opening marker to the LAST matching
    # closing marker on the same line.
    _CLEAN_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'#.*#',      # text enclosed in '#' markers
        r'//@.*:',    # '//@' ... ASCII colon
        r'//@.*：',   # '//@' ... full-width colon
        r'//.*:',     # '//' ... ASCII colon
        r'//.*：',    # '//' ... full-width colon
        r'【.*】',    # text enclosed in 【】 brackets
        r'《.*》',    # text enclosed in 《》 brackets
        r'//.*//',    # text enclosed in '//' markers
        r'@.*：',     # '@' ... full-width colon
        r'@.*:',      # '@' ... ASCII colon
        r'『.*』',    # text enclosed in 『』 brackets
        r'\d',        # every individual digit
    ))

    def __init__(self):
        """Load the training CSV and bucket its texts by sentiment label."""
        self.all = pd.read_csv("train&test/train.csv")
        self.pos = self._commentsWithLabel(1)
        self.neg = self._commentsWithLabel(-1)
        self.mid = self._commentsWithLabel(0)

    def _commentsWithLabel(self, label):
        """Return the text column of every training row carrying ``label``."""
        rows = self.all.loc[self.all['情感倾向'] == label]
        return rows['微博中文内容'].values.tolist()

    def cleanComment(self, comment):
        """Return ``comment`` with all ``_CLEAN_PATTERNS`` matches removed.

        Args:
            comment: The raw comment text; must be a ``str``.

        Returns:
            The cleaned text, which may be the empty string when the whole
            comment consisted of removable content.
        """
        for pattern in self._CLEAN_PATTERNS:
            comment = pattern.sub('', comment)
        return comment

    def _writeComments(self, path, comments):
        """Write the cleaned, non-empty ``comments`` to ``path``, one per line.

        The file is written as UTF-8 (the same encoding used for the test
        output) and is closed even if cleaning or writing raises.
        """
        with open(path, 'w', encoding='utf-8') as out:
            for comment in comments:
                # pandas represents a missing cell as float NaN, which the
                # regex engine cannot process; such rows carry no text.
                if not isinstance(comment, str):
                    continue
                cleaned = self.cleanComment(comment)
                if cleaned != '':
                    out.write(cleaned)
                    out.write('\n')

    def startSplit(self, posnum, negnum, midnum, testnum):
        """Write the cleaned training buckets and test set to text files.

        Args:
            posnum: Maximum number of entries taken from ``self.pos``.
            negnum: Maximum number of entries taken from ``self.neg``.
            midnum: Maximum number of entries taken from ``self.mid``.
            testnum: Maximum number of rows taken from ``test.csv``.

        The outputs are ``pos.txt``, ``neg.txt``, ``mid.txt`` and
        ``test.txt`` inside the ``train&test`` directory.  Training comments
        that are missing or become empty after cleaning are skipped.
        """
        self._writeComments("train&test/pos.txt", self.pos[0:posnum])
        self._writeComments("train&test/neg.txt", self.neg[0:negnum])
        self._writeComments("train&test/mid.txt", self.mid[0:midnum])

        data = pd.read_csv('train&test/test.csv', encoding='utf-8')
        with open('train&test/test.txt', 'w', encoding='utf-8') as f:
            for line in data.values[0:testnum]:
                # Unlike the training files, every test row produces a line
                # (possibly empty), so output line i still corresponds to
                # input row i.  Only the first CSV column is used.
                testline = self.cleanComment(str(line[0]))
                f.write((testline + '\n'))