{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## 案例1 过滤网站的恶意留言" ] }, { "cell_type": "code", "execution_count": 178, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from numpy import *" ] }, { "cell_type": "code", "execution_count": 179, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def loadDataSet():\n", " \"\"\"\n", " 创建实验样本\n", " return: 单词列表postingList, 所属类别classVec\n", " \"\"\"\n", " postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], #[0,0,1,1,1......]\n", " ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\n", " ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\n", " ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\n", " ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\n", " ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]\n", " classVec = [0, 1, 0, 1, 0, 1] # 1 is abusive, 0 not\n", " return postingList, classVec" ] }, { "cell_type": "code", "execution_count": 180, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def createVocabList(dataSet):\n", " \"\"\"\n", " 获取所有单词的集合\n", " :param dataSet: 数据集\n", " :return: 所有单词的集合(即不含重复元素的单词列表)\n", " \"\"\"\n", " vocabSet = set([]) # create empty set\n", " for document in dataSet:\n", " # 操作符 | 用于求两个集合的并集\n", " vocabSet = vocabSet | set(document) # union of the two sets\n", " return list(vocabSet)\n" ] }, { "cell_type": "code", "execution_count": 181, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def setOfWords2Vec(vocabList, inputSet):\n", " \"\"\"\n", " 遍历查看该单词是否出现,出现该单词则将该单词置1\n", " :param vocabList: 所有单词集合列表\n", " :param inputSet: 输入数据集\n", " :return: 匹配列表[0,1,0,1...],其中 1与0 表示词汇表中的单词是否出现在输入的数据集中\n", " \"\"\"\n", " # 创建一个和词汇表等长的向量,并将其元素都设置为0\n", " returnVec = [0] * len(vocabList)# [0,0......]\n", " # 遍历文档中的所有单词,如果出现了词汇表中的单词,则将输出的文档向量中的对应值设为1\n", " for word in inputSet:\n", " if word in vocabList:\n", " returnVec[vocabList.index(word)] = 1\n", " else:\n", " print (\"the word: %s is not in my Vocabulary!\" % word)\n", " return returnVec" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['so',\n", " 'maybe',\n", " 'not',\n", " 'cute',\n", " 'to',\n", " 'stop',\n", " 'worthless',\n", " 'food',\n", " 'quit',\n", " 'park',\n", " 'how',\n", " 'stupid',\n", " 'love',\n", " 'dalmation',\n", " 'flea',\n", " 'posting',\n", " 'ate',\n", " 'steak',\n", " 'my',\n", " 'garbage',\n", " 'dog',\n", " 'help',\n", " 'him',\n", " 'is',\n", " 'licks',\n", " 'I',\n", " 'problems',\n", " 'buying',\n", " 'has',\n", " 'take',\n", " 'mr',\n", " 'please']" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "listOPosts, listClasses = loadDataSet()\n", "myVocabList = createVocabList(listOPosts)\n", "myVocabList" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 0,\n", " 1,\n", " 1,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 0,\n", " 1,\n", " 0,\n", " 0,\n", " 1]" ] }, "execution_count": 183, "metadata": {}, "output_type": "execute_result" } ], "source": [ "setOfWords2Vec(myVocabList, listOPosts[0])" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 1,\n", 
" 0,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 0,\n", " 0,\n", " 0,\n", " 1,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0]" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "setOfWords2Vec(myVocabList, listOPosts[3])" ] }, { "cell_type": "code", "execution_count": 185, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def _trainNB0(trainMatrix, trainCategory):\n", " \"\"\"\n", " 训练数据原版\n", " :param trainMatrix: 文件单词矩阵 [[1,0,1,1,1....],[],[]...]\n", " :param trainCategory: 文件对应的类别[0,1,1,0....],列表长度等于单词矩阵数,其中的1代表对应的文件是侮辱性文件,0代表不是侮辱性矩阵\n", " :return:\n", " \"\"\"\n", " # 文件数\n", " numTrainDocs = len(trainMatrix)\n", " # 单词数\n", " numWords = len(trainMatrix[0])\n", " # 侮辱性文件的出现概率,即trainCategory中所有的1的个数,\n", " # 代表的就是多少个侮辱性文件,与文件的总数相除就得到了侮辱性文件的出现概率\n", " pAbusive = sum(trainCategory) / float(numTrainDocs)\n", " # 构造单词出现次数列表\n", " p0Num = zeros(numWords) # [0,0,0,.....]\n", " p1Num = zeros(numWords) # [0,0,0,.....]\n", "\n", " # 整个数据集单词出现总数\n", " p0Denom = 0.0\n", " p1Denom = 0.0\n", " for i in range(numTrainDocs):\n", " # 遍历所有的文件,如果是侮辱性文件,就计算此侮辱性文件中出现的侮辱性单词的个数\n", " if trainCategory[i] == 1:\n", " p1Num += trainMatrix[i] #[0,1,1,....]->[0,1,1,...]\n", " p1Denom += sum(trainMatrix[i])\n", " else:\n", " # 如果不是侮辱性文件,则计算非侮辱性文件中出现的侮辱性单词的个数\n", " p0Num += trainMatrix[i]\n", " p0Denom += sum(trainMatrix[i])\n", " # 类别1,即侮辱性文档的[P(F1|C1),P(F2|C1),P(F3|C1),P(F4|C1),P(F5|C1)....]列表\n", " # 即 在1类别下,每个单词出现次数的占比\n", " p1Vect = p1Num / p1Denom# [1,2,3,5]/90->[1/90,...]\n", " # 类别0,即正常文档的[P(F1|C0),P(F2|C0),P(F3|C0),P(F4|C0),P(F5|C0)....]列表\n", " # 即 在0类别下,每个单词出现次数的占比\n", " p0Vect = p0Num / p0Denom\n", " return p0Vect, p1Vect, pAbusive" ] }, { "cell_type": "code", "execution_count": 186, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def trainNB0(trainMatrix, trainCategory):\n", " \"\"\"\n", " 训练数据优化版本\n", " :param trainMatrix: 文件单词矩阵\n", " :param trainCategory: 文件对应的类别\n", " :return:\n", " \"\"\"\n", " # 总文件数\n", " numTrainDocs = len(trainMatrix)\n", " # 总单词数\n", " numWords = len(trainMatrix[0])\n", " # 侮辱性文件的出现概率\n", " pAbusive = sum(trainCategory) / float(numTrainDocs)\n", " # 构造单词出现次数列表\n", " # p0Num 正常的统计\n", " # p1Num 侮辱的统计 \n", " # 避免单词列表中的任何一个单词为0,而导致最后的乘积为0,所以将每个单词的出现次数初始化为 1\n", " p0Num = ones(numWords)#[0,0......]->[1,1,1,1,1.....]\n", " p1Num = ones(numWords)\n", "\n", " # 整个数据集单词出现总数,2.0根据样本/实际调查结果调整分母的值(2主要是避免分母为0,当然值可以调整)\n", " # p0Denom 正常的统计\n", " # p1Denom 侮辱的统计\n", " p0Denom = 2.0\n", " p1Denom = 2.0\n", " for i in range(numTrainDocs):\n", " if trainCategory[i] == 1:\n", " # 累加辱骂词的频次\n", " p1Num += trainMatrix[i]\n", " # 对每篇文章的辱骂的频次 进行统计汇总\n", " p1Denom += sum(trainMatrix[i])\n", " else:\n", " p0Num += trainMatrix[i]\n", " p0Denom += sum(trainMatrix[i])\n", " # 类别1,即侮辱性文档的[log(P(F1|C1)),log(P(F2|C1)),log(P(F3|C1)),log(P(F4|C1)),log(P(F5|C1))....]列表\n", " p1Vect = log(p1Num / p1Denom)\n", " # 类别0,即正常文档的[log(P(F1|C0)),log(P(F2|C0)),log(P(F3|C0)),log(P(F4|C0)),log(P(F5|C0))....]列表\n", " p0Vect = log(p0Num / p0Denom)\n", " return p0Vect, p1Vect, pAbusive" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6\n", "3\n" ] } ], "source": [ "listOPosts, listClasses = loadDataSet()\n", "myVocalist = createVocabList(listOPosts)\n", "trainMat = []\n", "for postinDoc in listOPosts:\n", " trainMat.append(setOfWords2Vec(myVocabList, postinDoc))\n", " 
\n", "print (len(trainMat))\n", "print (sum(listClasses))" ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5" ] }, "execution_count": 188, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p0V, p1V, pAb = trainNB0(trainMat, listClasses)\n", "pAb" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-2.56494936, -3.25809654, -3.25809654, -2.56494936, -2.56494936,\n", " -2.56494936, -3.25809654, -3.25809654, -3.25809654, -3.25809654,\n", " -2.56494936, -3.25809654, -2.56494936, -2.56494936, -2.56494936,\n", " -3.25809654, -2.56494936, -2.56494936, -1.87180218, -3.25809654,\n", " -2.56494936, -2.56494936, -2.15948425, -2.56494936, -2.56494936,\n", " -2.56494936, -2.56494936, -3.25809654, -2.56494936, -3.25809654,\n", " -2.56494936, -2.56494936])" ] }, "execution_count": 189, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p0V" ] }, { "cell_type": "code", "execution_count": 190, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):\n", " \"\"\"\n", " 使用算法:\n", " # 将乘法转换为加法\n", " 乘法:P(C|F1F2...Fn) = P(F1F2...Fn|C)P(C)/P(F1F2...Fn)\n", " 加法:P(F1|C)*P(F2|C)....P(Fn|C)P(C) -> log(P(F1|C))+log(P(F2|C))+....+log(P(Fn|C))+log(P(C))\n", " :param vec2Classify: 待测数据[0,1,1,1,1...],即要分类的向量\n", " :param p0Vec: 类别0,即正常文档的[log(P(F1|C0)),log(P(F2|C0)),log(P(F3|C0)),log(P(F4|C0)),log(P(F5|C0))....]列表\n", " :param p1Vec: 类别1,即侮辱性文档的[log(P(F1|C1)),log(P(F2|C1)),log(P(F3|C1)),log(P(F4|C1)),log(P(F5|C1))....]列表\n", " :param pClass1: 类别1,侮辱性文件的出现概率\n", " :return: 类别1 or 0\n", " \"\"\"\n", " # 计算公式 log(P(F1|C))+log(P(F2|C))+....+log(P(Fn|C))+log(P(C))\n", " # 使用 NumPy 数组来计算两个向量相乘的结果,这里的相乘是指对应元素相乘,即先将两个向量中的第一个元素相乘,然后将第2个元素相乘,以此类推。\n", " # 我的理解是:这里的 vec2Classify * p1Vec 的意思就是将每个词与其对应的概率相关联起来\n", " # 可以理解为 1.单词在词汇表中的条件下,文件是good 类别的概率 也可以理解为 2.在整个空间下,文件既在词汇表中又是good类别的概率\n", " p1 = sum(vec2Classify * p1Vec) + log(pClass1)\n", " p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)\n", " if p1 > p0:\n", " return 1\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": 191, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def testingNB():\n", " \"\"\"\n", " 测试朴素贝叶斯算法\n", " \"\"\"\n", " # 1. 加载数据集\n", " listOPosts, listClasses = loadDataSet()\n", " # 2. 创建单词集合\n", " myVocabList = createVocabList(listOPosts)\n", " # 3. 计算单词是否出现并创建数据矩阵\n", " trainMat = []\n", " for postinDoc in listOPosts:\n", " # 返回m*len(myVocabList)的矩阵, 记录的都是0,1信息\n", " trainMat.append(setOfWords2Vec(myVocabList, postinDoc))\n", " # 4. 训练数据\n", " p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))\n", " # 5. 
测试数据\n", " testEntry = ['love', 'my', 'dalmation']\n", " thisDoc = array(setOfWords2Vec(myVocabList, testEntry))\n", " print (testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))\n", " testEntry = ['stupid', 'garbage']\n", " thisDoc = array(setOfWords2Vec(myVocabList, testEntry))\n", " print (testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['love', 'my', 'dalmation'] classified as: 0\n", "['stupid', 'garbage'] classified as: 1\n" ] } ], "source": [ "testingNB()" ] }, { "cell_type": "code", "execution_count": 193, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#朴素贝叶斯词袋模型\n", "def bagOfWords2VecMN(vocabList, inputSet):\n", " returnVec = [0] * len(vocabList)\n", " for word in inputSet:\n", " if word in vocabList:\n", " returnVec[vocabList.index(word)] += 1\n", " return returnVec" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 案例2 使用贝叶斯过滤垃圾邮件" ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['This',\n", " 'book',\n", " 'is',\n", " 'the',\n", " 'best',\n", " 'book',\n", " 'pn',\n", " 'python',\n", " 'or',\n", " 'M.L.',\n", " 'I',\n", " 'have',\n", " 'ever',\n", " 'laid',\n", " 'eyes',\n", " 'upon.']" ] }, "execution_count": 194, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mySent = 'This book is the best book pn python or M.L. I have ever laid eyes upon.'\n", "mySent.split()" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/kyzhang/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: split() requires a non-empty pattern match.\n", " This is separate from the ipykernel package so we can avoid doing imports until\n" ] }, { "data": { "text/plain": [ "['This',\n", " 'book',\n", " 'is',\n", " 'the',\n", " 'best',\n", " 'book',\n", " 'pn',\n", " 'python',\n", " 'or',\n", " 'M',\n", " 'L',\n", " 'I',\n", " 'have',\n", " 'ever',\n", " 'laid',\n", " 'eyes',\n", " 'upon',\n", " '']" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "regEx = re.compile('\\\\W*')\n", "listOfTokens = regEx.split(mySent)\n", "listOfTokens" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['This',\n", " 'book',\n", " 'is',\n", " 'the',\n", " 'best',\n", " 'book',\n", " 'pn',\n", " 'python',\n", " 'or',\n", " 'M',\n", " 'L',\n", " 'I',\n", " 'have',\n", " 'ever',\n", " 'laid',\n", " 'eyes',\n", " 'upon']" ] }, "execution_count": 196, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[tok for tok in listOfTokens if len(tok) > 0]" ] }, { "cell_type": "code", "execution_count": 197, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['this',\n", " 'book',\n", " 'is',\n", " 'the',\n", " 'best',\n", " 'book',\n", " 'pn',\n", " 'python',\n", " 'or',\n", " 'm',\n", " 'l',\n", " 'i',\n", " 'have',\n", " 'ever',\n", " 'laid',\n", " 'eyes',\n", " 'upon']" ] }, "execution_count": 197, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[tok.lower() for tok in listOfTokens if len(tok) > 0]" ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
"/Users/kyzhang/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: split() requires a non-empty pattern match.\n", " \n" ] } ], "source": [ "emailText = open('email/ham/18.txt').read()\n", "listOfTokens = regEx.split(emailText)" ] }, { "cell_type": "code", "execution_count": 199, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 切分文本\n", "def textParse(bigString):\n", " '''\n", " Desc:\n", " 接收一个大字符串并将其解析为字符串列表\n", " Args:\n", " bigString -- 大字符串\n", " Returns:\n", " 去掉少于 2 个字符的字符串,并将所有字符串转换为小写,返回字符串列表\n", " '''\n", " import re\n", " # 使用正则表达式来切分句子,其中分隔符是除单词、数字外的任意字符串\n", " listOfTokens = re.split(r'\\W*', bigString)\n", " return [tok.lower() for tok in listOfTokens if len(tok) > 2]\n" ] }, { "cell_type": "code", "execution_count": 200, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def spamTest():\n", " '''\n", " Desc:\n", " 对贝叶斯垃圾邮件分类器进行自动化处理。\n", " Args:\n", " none\n", " Returns:\n", " 对测试集中的每封邮件进行分类,若邮件分类错误,则错误数加 1,最后返回总的错误百分比。\n", " '''\n", " docList = []\n", " classList = []\n", " fullText = []\n", " for i in range(1, 26):\n", " # 切分,解析数据,并归类为 1 类别\n", " wordList = textParse(open('email/spam/%d.txt' % i).read())\n", " docList.append(wordList)\n", " classList.append(1)\n", " # 切分,解析数据,并归类为 0 类别\n", " wordList = textParse(open('email/ham/%d.txt' % i).read())\n", " docList.append(wordList)\n", " fullText.extend(wordList)\n", " classList.append(0)\n", " # 创建词汇表 \n", " vocabList = createVocabList(docList)\n", " trainingSet = list(range(50))\n", " testSet = []\n", " # 随机取 10 个邮件用来测试\n", " for i in range(10):\n", " # random.uniform(x, y) 随机生成一个范围为 x - y 的实数\n", " randIndex = int(random.uniform(0, len(trainingSet)))\n", " testSet.append(trainingSet[randIndex])\n", " del(trainingSet[randIndex])\n", " trainMat = []\n", " trainClasses = []\n", " for docIndex in trainingSet:\n", " trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))\n", " trainClasses.append(classList[docIndex])\n", " p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))\n", " errorCount = 0\n", " for docIndex in testSet:\n", " wordVector = setOfWords2Vec(vocabList, docList[docIndex])\n", " if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:\n", " errorCount += 1\n", " print ('the errorCount is: ', errorCount)\n", " print ('the testSet length is :', len(testSet))\n", " print ('the error rate is :', float(errorCount)/len(testSet))" ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the errorCount is: 0\n", "the testSet length is : 10\n", "the error rate is : 0.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/kyzhang/anaconda/lib/python3.6/re.py:212: FutureWarning: split() requires a non-empty pattern match.\n", " return _compile(pattern, flags).split(string, maxsplit)\n" ] } ], "source": [ "spamTest()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 示例3 使用贝叶斯分类器从个人广告中获取区域倾向" ] }, { "cell_type": "code", "execution_count": 202, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 将文本文件解析成 词条向量\n", "def setOfWords2VecMN(vocabList,inputSet):\n", " returnVec=[0]*len(vocabList) # 创建一个其中所含元素都为0的向量\n", " for word in inputSet:\n", " if word in vocabList:\n", " returnVec[vocabList.index(word)]+=1\n", " return returnVec" ] }, { "cell_type": "code", "execution_count": 203, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#文件解析\n", "def textParse(bigString):\n", " import re\n", " 
listOfTokens=re.split(r'\\W*', bigString)\n", " return [tok.lower() for tok in listOfTokens if len(tok)>2]" ] }, { "cell_type": "code", "execution_count": 204, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#RSS源分类器及高频词去除函数\n", "def calcMostFreq(vocabList,fullText):\n", " import operator\n", " freqDict={}\n", " for token in vocabList: #遍历词汇表中的每个词\n", " freqDict[token]=fullText.count(token) #统计每个词在文本中出现的次数\n", " sortedFreq=sorted(freqDict.items(),key=operator.itemgetter(1),reverse=True) #根据每个词出现的次数从高到底对字典进行排序\n", " return sortedFreq[:30] #返回出现次数最高的30个单词" ] }, { "cell_type": "code", "execution_count": 205, "metadata": {}, "outputs": [], "source": [ "def localWords(feed1,feed0):\n", " import feedparser\n", " docList=[];classList=[];fullText=[]\n", " minLen=min(len(feed1['entries']),len(feed0['entries']))\n", " for i in range(minLen):\n", " wordList=textParse(feed1['entries'][i]['summary']) #每次访问一条RSS源\n", " docList.append(wordList)\n", " fullText.extend(wordList)\n", " classList.append(1)\n", " wordList=textParse(feed0['entries'][i]['summary'])\n", " docList.append(wordList)\n", " fullText.extend(wordList)\n", " classList.append(0)\n", " vocabList=createVocabList(docList)\n", " top30Words=calcMostFreq(vocabList,fullText)\n", " for pairW in top30Words:\n", " if pairW[0] in vocabList:vocabList.remove(pairW[0]) #去掉出现次数最高的那些词\n", " trainingSet=list(range(2*minLen));testSet=[]\n", " for i in range(20):\n", " randIndex=int(random.uniform(0,len(trainingSet)))\n", " testSet.append(trainingSet[randIndex])\n", " del(trainingSet[randIndex])\n", " trainMat=[];trainClasses=[]\n", " for docIndex in trainingSet:\n", " trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))\n", " trainClasses.append(classList[docIndex])\n", " p0V,p1V,pSpam=trainNB0(array(trainMat),array(trainClasses))\n", " errorCount=0\n", " for docIndex in testSet:\n", " wordVector=bagOfWords2VecMN(vocabList,docList[docIndex])\n", " if classifyNB(array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]:\n", " errorCount+=1\n", " print ('the error rate is:',float(errorCount)/len(testSet))\n", " return vocabList,p0V,p1V" ] }, { "cell_type": "code", "execution_count": 206, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the error rate is: 0.4\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/kyzhang/anaconda/lib/python3.6/re.py:212: FutureWarning: split() requires a non-empty pattern match.\n", " return _compile(pattern, flags).split(string, maxsplit)\n" ] } ], "source": [ "import feedparser\n", "ny = feedparser.parse('http://newyork.craiglist.org/stp/index.rss')\n", "sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')\n", "vocabList,pSF,pNY = localWords(ny, sf)" ] }, { "cell_type": "code", "execution_count": 207, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 最具表征性的词汇显示函数\n", "def getTopWords(ny,sf):\n", " import operator\n", " vocabList,p0V,p1V=localWords(ny,sf)\n", " topNY=[];topSF=[]\n", " for i in range(len(p0V)):\n", " if p0V[i]>-6.0:topSF.append((vocabList[i],p0V[i]))\n", " if p1V[i]>-6.0:topNY.append((vocabList[i],p1V[i]))\n", " sortedSF=sorted(topSF,key=lambda pair:pair[1],reverse=True)\n", " print (\"SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**\")\n", " for item in sortedSF:\n", " print (item[0])\n", " sortedNY=sorted(topNY,key=lambda pair:pair[1],reverse=True)\n", " print (\"NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**\")\n", " for item in sortedNY:\n", " print (item[0])" ] }, { "cell_type": "code", 
"execution_count": 208, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/kyzhang/anaconda/lib/python3.6/re.py:212: FutureWarning: split() requires a non-empty pattern match.\n", " return _compile(pattern, flags).split(string, maxsplit)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "the error rate is: 0.35\n", "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**\n", "work\n", "married\n", "asian\n", "hit\n", "buddy\n", "get\n", "hang\n", "down\n", "cool\n", "please\n", "oakland\n", "body\n", "activities\n", "drama\n", "girl\n", "they\n", "head\n", "hello\n", "any\n", "from\n", "break\n", "old\n", "area\n", "white\n", "both\n", "attractive\n", "couple\n", "now\n", "mature\n", "years\n", "there\n", "all\n", "around\n", "see\n", "expectations\n", "friend\n", "something\n", "brown\n", "lookin\n", "here\n", "maybe\n", "seeking\n", "girls\n", "has\n", "fun\n", "were\n", "meet\n", "american\n", "was\n", "scorpios\n", "business\n", "therapy\n", "build\n", "150\n", "will\n", "racial\n", "require\n", "than\n", "give\n", "sebastopol\n", "best\n", "teasing\n", "exploring\n", "hands\n", "picture\n", "other\n", "things\n", "enjoy\n", "friendship\n", "40s\n", "together\n", "visiting\n", "fresh\n", "week\n", "occupational\n", "boob\n", "job\n", "relationship\n", "moved\n", "got\n", "circle\n", "bbw\n", "sperm\n", "dance\n", "home\n", "favor\n", "year\n", "had\n", "untold\n", "ready\n", "hold\n", "start\n", "east\n", "hmu\n", "showings\n", "sometimes\n", "however\n", "always\n", "gay\n", "apart\n", "men\n", "caring\n", "fit\n", "sjsu\n", "reason\n", "burner\n", "lately\n", "great\n", "night\n", "medical\n", "outside\n", "dude\n", "online\n", "single\n", "when\n", "crime\n", "swimming\n", "strings\n", "thousands\n", "normal\n", "full\n", "pleasurable\n", "nerdy\n", "saying\n", "harmless\n", "wondering\n", "goes\n", "wanna\n", "since\n", "hard\n", "straight\n", "latina\n", "peps\n", "3rd\n", "car\n", "want\n", "seem\n", "level\n", "much\n", "read\n", "distraction\n", "control\n", "chemistry\n", "odds\n", "life\n", "82kg\n", "dinner\n", "specific\n", "gentleman\n", "public\n", "share\n", "first\n", "sucks\n", "right\n", "spots\n", "boyfriend\n", "stupid\n", "oysters\n", "benefits\n", "hot\n", "include\n", "lbs\n", "guys\n", "ladies\n", "sexy\n", "rewards\n", "simply\n", "pittsburg\n", "feel\n", "hotel\n", "partner\n", "zuno565\n", "moutain\n", "private\n", "summary\n", "stress\n", "film\n", "rocky\n", "been\n", "clean\n", "little\n", "lean\n", "one\n", "kik\n", "black\n", "open\n", "eyes\n", "toward\n", "consider\n", "don\n", "real\n", "without\n", "happy\n", "parties\n", "rialto\n", "being\n", "ethnicity\n", "mix\n", "after\n", "woman\n", "away\n", "disease\n", "kind\n", "compensate\n", "flash\n", "also\n", "freaks\n", "everyone\n", "unconventional\n", "37f\n", "depending\n", "female\n", "handle\n", "hair\n", "hey\n", "once\n", "dont\n", "male\n", "present\n", "companionship\n", "myself\n", "attraction\n", "routine\n", "beaches\n", "professional\n", "180lbs\n", "rushing\n", "her\n", "uninhibited\n", "convo\n", "degree\n", "book\n", "anyo\n", "mex\n", "enjoyment\n", "comfort\n", "weight\n", "slim\n", "bay\n", "francisco\n", "executive\n", "skinny\n", "try\n", "swimmer\n", "younger\n", "know\n", "party\n", "token\n", "location\n", "puss\n", "lot\n", "living\n", "possibly\n", "donor\n", "laugh\n", "respectful\n", "currently\n", "assets\n", "san\n", "fucked\n", "join\n", "while\n", "how\n", "commitment\n", "acting\n", "free\n", "drinks\n", "safe\n", 
"goodbye\n", "trying\n", "attention\n", "movies\n", "craigslist\n", "caregivers\n", "mutual\n", "otherwise\n", "height\n", "etc\n", "those\n", "stop\n", "nothing\n", "198cm\n", "likes\n", "submissive\n", "think\n", "make\n", "ill\n", "respond\n", "grow\n", "indian\n", "romantic\n", "graduated\n", "types\n", "monotony\n", "early\n", "coffee\n", "five\n", "9pm\n", "yonkers\n", "strange\n", "must\n", "because\n", "then\n", "comfortable\n", "oral\n", "expiring\n", "race\n", "treat\n", "aggression\n", "hormone\n", "type\n", "serious\n", "which\n", "specialize\n", "very\n", "defenseless\n", "few\n", "panty\n", "onl\n", "rom\n", "issue\n", "scream\n", "pre\n", "located\n", "latin\n", "everything\n", "womans\n", "cute\n", "part\n", "most\n", "fatass\n", "thick\n", "point\n", "dining\n", "yawn\n", "cut\n", "civilizations\n", "regarding\n", "low\n", "feeling\n", "delicious\n", "dinners\n", "nasty\n", "traveling\n", "gentle\n", "haven\n", "partying\n", "detailed\n", "beautiful\n", "contact\n", "your\n", "already\n", "fck\n", "lov\n", "upfront\n", "focus\n", "lady\n", "she\n", "boys\n", "nights\n", "answer\n", "ankle\n", "massage\n", "str8\n", "provide\n", "hosting\n", "wash\n", "his\n", "avoid\n", "easily\n", "blk\n", "games\n", "speak\n", "20s\n", "personality\n", "result\n", "okay\n", "styling\n", "places\n", "genuine\n", "near\n", "animals\n", "287\n", "same\n", "consuming\n", "sloppy\n", "smart\n", "times\n", "totally\n", "museums\n", "appointment\n", "30s\n", "females\n", "lets\n", "kevin\n", "find\n", "studio\n", "problem\n", "help\n", "music\n", "meeting\n", "sma\n", "only\n", "before\n", "intelligent\n", "mid\n", "february\n", "offbeat\n", "wry\n", "name\n", "model\n", "sexual\n", "lease\n", "stats\n", "connect\n", "through\n", "refined\n", "repeat\n", "next\n", "could\n", "hate\n", "made\n", "state\n", "cant\n", "howdy\n", "abuse\n", "excuse\n", "anything\n", "bit\n", "sit\n", "loves\n", "cafes\n", "pic\n", "town\n", "hilarious\n", "ongoing\n", "37yo\n", "cuddling\n", "take\n", "did\n", "curious\n", "skin\n", "artist\n", "sex\n", "able\n", "ccsf\n", "articulate\n", "cultural\n", "usually\n", "mushroom\n", "smoke\n", "place\n", "special\n", "mean\n", "animes\n", "vernon\n", "front\n", "aggressive\n", "ever\n", "interested\n", "iam\n", "kids\n", "alone\n", "weird\n", "super\n", "wouldn\n", "self\n", "basic\n", "checked\n", "send\n", "drummer\n", "major\n", "wanting\n", "familiarity\n", "crave\n", "well\n", "ass\n", "exist\n", "sautéed\n", "manhattan\n", "chair\n", "humor\n", "status\n", "chance\n", "couch\n", "lowkey\n", "dick\n", "tattoos\n", "strong\n", "philosophy\n", "offer\n", "makin\n", "sedate\n", "myths\n", "live\n", "transgender\n", "conceivable\n", "fantasies\n", "save\n", "doesn\n", "tied\n", "dog\n", "guess\n", "having\n", "let\n", "actually\n", "eat\n", "antisocial\n", "lunch\n", "over\n", "chances\n", "moan\n", "stay\n", "panties\n", "bottoming\n", "scratching\n", "lifestyle\n", "today\n", "enough\n", "picky\n", "our\n", "perspective\n", "conversation\n", "another\n", "origina\n", "educated\n", "9am\n", "nyc\n", "inch\n", "eight\n", "abusive\n", "coms\n", "fungus\n", "happily\n", "onions\n", "mountains\n", "email\n", "questions\n", "getting\n", "strike\n", "puke\n", "reply\n", "busy\n", "milf\n", "even\n", "verbally\n", "lonely\n", "slightly\n", "parks\n", "classy\n", "although\n", "spank\n", "zero\n", "anyone\n", "port\n", "desperate\n", "date\n", "requiring\n", "people\n", "remaining\n", "about\n", "city\n", "age\n", "host\n", "replacement\n", "masculine\n", "sci\n", "steal\n", 
"reading\n", "ancient\n", "brute\n", "bottom\n", "man\n", "return\n", "weekends\n", "mind\n", "extremely\n", "funny\n", "minded\n", "might\n", "conveniently\n", "where\n", "smooth\n", "excites\n", "information\n", "hobbies\n", "backstory\n", "trans\n", "horny\n", "went\n", "into\n", "move\n", "top\n", "talk\n", "put\n", "beat\n", "hispanic\n", "pics\n", "sense\n", "vgl\n", "cuddle\n", "play\n", "isn\n", "roads\n", "hand\n", "nature\n", "cuffed\n", "leaves\n", "rough\n", "cry\n", "6530\n", "food\n", "close\n", "bear\n", "house\n", "emails\n", "income\n", "marches\n", "back\n", "daddy\n", "june\n", "satisfy\n", "biz\n", "dudesdnt\n", "beach\n", "ive\n", "quiet\n", "shape\n", "brought\n", "current\n", "matter\n", "tall\n", "size\n", "chat\n", "fear\n", "taken\n", "money\n", "needs\n", "gross\n", "these\n", "travel\n", "hamper\n", "facing\n", "hanging\n", "lost\n", "cliche\n", "soon\n", "realized\n", "edm\n", "haircutting\n", "deets\n", "often\n", "lame\n", "search\n", "person\n", "own\n", "every\n", "tag\n", "heads\n", "care\n", "nice\n", "347\n", "chester\n", "dominate\n", "input\n", "lol\n", "room\n", "dates\n", "come\n", "thing\n", "prefer\n", "laid\n", "many\n", "message\n", "hung\n", "420\n", "numerous\n", "tales\n", "answers\n", "isolate\n", "sad\n", "month\n", "discreet\n", "events\n", "outgoing\n", "beauty\n", "mwm\n", "artistic\n", "wants\n", "similar\n", "fair\n", "fungi\n", "really\n", "arts\n", "hopefully\n", "sound\n", "chill\n", "big\n", "muscleboy\n", "them\n", "plus\n", "penis\n", "std\n", "loud\n", "should\n", "going\n", "NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**\n", "very\n", "make\n", "really\n", "most\n", "times\n", "old\n", "get\n", "been\n", "myself\n", "type\n", "serious\n", "moved\n", "your\n", "year\n", "however\n", "find\n", "they\n", "could\n", "mean\n", "now\n", "little\n", "eight\n", "don\n", "host\n", "back\n", "size\n", "travel\n", "down\n", "dominate\n", "five\n", "therapy\n", "yonkers\n", "strange\n", "must\n", "will\n", "because\n", "oral\n", "hormone\n", "please\n", "which\n", "few\n", "onl\n", "issue\n", "scream\n", "pre\n", "hands\n", "everything\n", "womans\n", "cute\n", "other\n", "fatass\n", "thick\n", "point\n", "yawn\n", "cut\n", "regarding\n", "low\n", "feeling\n", "delicious\n", "body\n", "hit\n", "nasty\n", "relationship\n", "traveling\n", "activities\n", "detailed\n", "fck\n", "home\n", "upfront\n", "focus\n", "answer\n", "massage\n", "str8\n", "provide\n", "hosting\n", "avoid\n", "easily\n", "games\n", "speak\n", "personality\n", "result\n", "places\n", "hmu\n", "same\n", "consuming\n", "fit\n", "lately\n", "problem\n", "dude\n", "hello\n", "only\n", "before\n", "full\n", "model\n", "from\n", "refined\n", "repeat\n", "hate\n", "made\n", "state\n", "cant\n", "howdy\n", "anything\n", "sit\n", "ongoing\n", "seem\n", "curious\n", "artist\n", "able\n", "articulate\n", "usually\n", "mushroom\n", "life\n", "vernon\n", "aggressive\n", "alone\n", "weird\n", "wouldn\n", "self\n", "basic\n", "white\n", "wanting\n", "familiarity\n", "sautéed\n", "lowkey\n", "guys\n", "sexy\n", "strong\n", "offer\n", "feel\n", "conceivable\n", "save\n", "couple\n", "dog\n", "actually\n", "antisocial\n", "over\n", "moan\n", "lifestyle\n", "enough\n", "picky\n", "perspective\n", "nyc\n", "inch\n", "black\n", "abusive\n", "open\n", "fungus\n", "onions\n", "email\n", "questions\n", "years\n", "real\n", "puke\n", "all\n", "busy\n", "even\n", "verbally\n", "lonely\n", "being\n", "although\n", "zero\n", "anyone\n", "requiring\n", "remaining\n", "about\n", "city\n", 
"female\n", "replacement\n", "masculine\n", "weekends\n", "mind\n", "minded\n", "might\n", "where\n", "information\n", "trans\n", "move\n", "talk\n", "hispanic\n", "see\n", "play\n", "comfort\n", "friend\n", "something\n", "skinny\n", "cry\n", "6530\n", "try\n", "food\n", "know\n", "bear\n", "house\n", "daddy\n", "living\n", "laugh\n", "june\n", "satisfy\n", "dudesdnt\n", "ive\n", "tall\n", "money\n", "needs\n", "gross\n", "these\n", "here\n", "soon\n", "realized\n", "while\n", "free\n", "has\n", "fun\n", "tag\n", "347\n", "movies\n", "input\n", "otherwise\n", "meet\n", "laid\n", "likes\n", "submissive\n", "message\n", "numerous\n", "isolate\n", "sad\n", "discreet\n", "outgoing\n", "beauty\n", "fair\n", "fungi\n", "muscleboy\n", "them\n", "penis\n", "going\n", "coffee\n", "scorpios\n", "9pm\n", "business\n", "build\n", "150\n", "then\n", "racial\n", "comfortable\n", "expiring\n", "race\n", "treat\n", "aggression\n", "asian\n", "require\n", "than\n", "give\n", "specialize\n", "defenseless\n", "panty\n", "rom\n", "sebastopol\n", "best\n", "teasing\n", "exploring\n", "located\n", "latin\n", "picture\n", "part\n", "things\n", "enjoy\n", "friendship\n", "40s\n", "together\n", "dining\n", "visiting\n", "fresh\n", "week\n", "oakland\n", "civilizations\n", "dinners\n", "occupational\n", "boob\n", "job\n", "gentle\n", "haven\n", "got\n", "circle\n", "bbw\n", "partying\n", "beautiful\n", "contact\n", "sperm\n", "already\n", "lov\n", "dance\n", "lady\n", "she\n", "favor\n", "boys\n", "nights\n", "ankle\n", "wash\n", "his\n", "had\n", "untold\n", "blk\n", "ready\n", "20s\n", "hold\n", "okay\n", "start\n", "styling\n", "east\n", "genuine\n", "near\n", "animals\n", "drama\n", "showings\n", "sometimes\n", "287\n", "always\n", "sloppy\n", "gay\n", "smart\n", "totally\n", "museums\n", "appointment\n", "apart\n", "30s\n", "men\n", "caring\n", "sjsu\n", "females\n", "reason\n", "lets\n", "burner\n", "kevin\n", "great\n", "night\n", "studio\n", "medical\n", "outside\n", "online\n", "help\n", "music\n", "girl\n", "meeting\n", "single\n", "head\n", "sma\n", "when\n", "crime\n", "swimming\n", "intelligent\n", "strings\n", "thousands\n", "mid\n", "february\n", "normal\n", "offbeat\n", "wry\n", "pleasurable\n", "nerdy\n", "name\n", "any\n", "saying\n", "sexual\n", "harmless\n", "lease\n", "wondering\n", "goes\n", "wanna\n", "since\n", "stats\n", "connect\n", "break\n", "through\n", "hard\n", "straight\n", "latina\n", "next\n", "peps\n", "3rd\n", "car\n", "abuse\n", "excuse\n", "bit\n", "loves\n", "cafes\n", "pic\n", "want\n", "town\n", "hilarious\n", "level\n", "much\n", "37yo\n", "read\n", "cuddling\n", "take\n", "did\n", "distraction\n", "area\n", "skin\n", "sex\n", "control\n", "chemistry\n", "odds\n", "ccsf\n", "cultural\n", "smoke\n", "place\n", "special\n", "82kg\n", "dinner\n", "animes\n", "specific\n", "front\n", "buddy\n", "gentleman\n", "public\n", "ever\n", "interested\n", "share\n", "first\n", "iam\n", "kids\n", "sucks\n", "right\n", "super\n", "spots\n", "boyfriend\n", "stupid\n", "checked\n", "send\n", "drummer\n", "major\n", "work\n", "both\n", "crave\n", "well\n", "oysters\n", "benefits\n", "ass\n", "exist\n", "hot\n", "manhattan\n", "chair\n", "humor\n", "include\n", "lbs\n", "status\n", "chance\n", "couch\n", "dick\n", "tattoos\n", "attractive\n", "ladies\n", "rewards\n", "philosophy\n", "makin\n", "sedate\n", "myths\n", "live\n", "simply\n", "pittsburg\n", "transgender\n", "fantasies\n", "hotel\n", "doesn\n", "married\n", "tied\n", "partner\n", "guess\n", "having\n", "let\n", "zuno565\n", 
"eat\n", "lunch\n", "moutain\n", "chances\n", "stay\n", "private\n", "summary\n", "panties\n", "stress\n", "mature\n", "film\n", "hang\n", "bottoming\n", "scratching\n", "rocky\n", "today\n", "clean\n", "our\n", "lean\n", "one\n", "conversation\n", "another\n", "origina\n", "educated\n", "9am\n", "kik\n", "eyes\n", "coms\n", "toward\n", "happily\n", "mountains\n", "consider\n", "there\n", "getting\n", "strike\n", "without\n", "happy\n", "reply\n", "milf\n", "around\n", "parties\n", "rialto\n", "slightly\n", "parks\n", "ethnicity\n", "classy\n", "mix\n", "after\n", "spank\n", "woman\n", "away\n", "disease\n", "port\n", "kind\n", "compensate\n", "desperate\n", "date\n", "people\n", "flash\n", "also\n", "freaks\n", "everyone\n", "unconventional\n", "37f\n", "age\n", "depending\n", "handle\n", "hair\n", "sci\n", "hey\n", "steal\n", "reading\n", "ancient\n", "once\n", "brute\n", "bottom\n", "man\n", "return\n", "dont\n", "extremely\n", "funny\n", "male\n", "present\n", "companionship\n", "attraction\n", "conveniently\n", "routine\n", "beaches\n", "professional\n", "smooth\n", "excites\n", "180lbs\n", "rushing\n", "her\n", "uninhibited\n", "convo\n", "hobbies\n", "backstory\n", "horny\n", "degree\n", "went\n", "book\n", "anyo\n", "into\n", "top\n", "mex\n", "put\n", "beat\n", "pics\n", "sense\n", "vgl\n", "cuddle\n", "expectations\n", "enjoyment\n", "isn\n", "roads\n", "hand\n", "nature\n", "cuffed\n", "weight\n", "slim\n", "leaves\n", "bay\n", "rough\n", "francisco\n", "executive\n", "swimmer\n", "close\n", "younger\n", "party\n", "token\n", "location\n", "emails\n", "puss\n", "income\n", "marches\n", "lot\n", "possibly\n", "donor\n", "biz\n", "beach\n", "respectful\n", "quiet\n", "shape\n", "brought\n", "current\n", "currently\n", "matter\n", "chat\n", "fear\n", "taken\n", "brown\n", "assets\n", "lookin\n", "san\n", "fucked\n", "hamper\n", "facing\n", "hanging\n", "join\n", "lost\n", "cliche\n", "maybe\n", "edm\n", "seeking\n", "how\n", "haircutting\n", "commitment\n", "acting\n", "deets\n", "often\n", "lame\n", "search\n", "drinks\n", "girls\n", "person\n", "safe\n", "own\n", "goodbye\n", "trying\n", "every\n", "heads\n", "attention\n", "care\n", "nice\n", "craigslist\n", "caregivers\n", "chester\n", "were\n", "mutual\n", "lol\n", "room\n", "dates\n", "height\n", "etc\n", "those\n", "stop\n", "nothing\n", "come\n", "cool\n", "thing\n", "prefer\n", "198cm\n", "many\n", "hung\n", "think\n", "420\n", "tales\n", "answers\n", "month\n", "ill\n", "events\n", "mwm\n", "respond\n", "artistic\n", "wants\n", "grow\n", "american\n", "similar\n", "arts\n", "hopefully\n", "sound\n", "chill\n", "big\n", "indian\n", "romantic\n", "was\n", "plus\n", "graduated\n", "types\n", "std\n", "loud\n", "should\n", "monotony\n", "early\n" ] } ], "source": [ "getTopWords(ny, sf)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }