'''
Created on Mar 1, 2018

Helpers for turning raw text into bag-of-words count vectors.
'''
from numpy import *


def textParse(bigString):
    """Split a large string into a list of lower-cased word tokens.

    The string is split on runs of non-word characters (anything other than
    letters, digits and underscore).  Tokens of length two or less are
    discarded and the remaining tokens are lower-cased.

    Args:
        bigString: The text to tokenise.

    Returns:
        A list of lower-cased tokens, in order of appearance, each longer
        than two characters.
    """
    import re

    # Use ``\W+`` rather than ``\W*``: a pattern that can match the empty
    # string makes ``re.split`` (Python 3.7+) split between every character,
    # so every token would be a single character and get filtered out below.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def createVocabList(dataSet):
    """Build a list of the distinct words that occur in any document.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once.  The order comes
        from iterating a set, so it is not guaranteed.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union avoids building a fresh set for every document.
        vocabSet.update(document)
    return list(vocabSet)


def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector.

    Args:
        vocabList: The vocabulary; position ``i`` of the result corresponds
            to ``vocabList[i]``.  Words must be hashable.
        inputSet: The words of one document (repeats are counted).

    Returns:
        A list of ints with the same length as ``vocabList`` in which each
        entry is the number of times that vocabulary word occurs in
        ``inputSet``.  Words that are not in the vocabulary are ignored.
    """
    # Map each word to the index of its first occurrence once, instead of
    # scanning the vocabulary list twice (``in`` + ``index``) for every word.
    firstIndex = {}
    for position, word in enumerate(vocabList):
        firstIndex.setdefault(word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec