{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 获取数据" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "data_folder = os.path.join(\".\", \"data\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %load getdata.py\n", "# Downloads the books and stores them in the below folder\n", "import os\n", "from time import sleep\n", "import urllib.request\n", "\n", "titles = {}\n", "\n", "\n", "titles['burton'] = [4657, 2400, 5760, 6036, 7111, 8821,\n", " 18506, 4658, 5761, 6886, 7113]\n", "titles['dickens'] = [24022, 1392, 1414, 1467, 2324, 580,\n", " 786, 888, 963, 27924, 1394, 1415, 15618,\n", " 25985, 588, 807, 914, 967, 30127, 1400,\n", " 1421, 16023, 28198, 644, 809, 917, 968, 1023,\n", " 1406, 1422, 17879, 30368, 675, 810, 924, 98,\n", " 1289, 1413, 1423, 17880, 32241, 699, 821, 927]\n", "titles['doyle'] = [2349, 11656, 1644, 22357, 2347, 290, 34627, 5148,\n", " 8394, 26153, 12555, 1661, 23059, 2348, 294, 355,\n", " 5260, 8727, 10446, 126, 17398, 2343, 2350, 3070,\n", " 356, 5317, 903, 10581, 13152, 2038, 2344, 244, 32536,\n", " 423, 537, 108, 139, 2097, 2345, 24951, 32777, 4295,\n", " 7964, 11413, 1638, 21768, 2346, 2845, 3289, 439, 834]\n", "titles['gaboriau'] = [1748, 1651, 2736, 3336, 4604, 4002, 2451,\n", " 305, 3802, 547]\n", "titles['nesbit'] = [34219, 23661, 28804, 4378, 778, 20404, 28725,\n", " 33028, 4513, 794]\n", "titles['tarkington'] = [1098, 15855, 1983, 297, 402, 5798,\n", " 8740, 980, 1158, 1611, 2326, 30092,\n", " 483, 5949, 8867, 13275, 18259, 2595,\n", " 3428, 5756, 6401, 9659]\n", "titles['twain'] = [1044, 1213, 245, 30092, 3176, 3179, 3183, 3189, 74,\n", " 86, 1086, 142, 2572, 3173, 3177, 3180, 3186, 3192,\n", " 76, 91, 119, 1837, 2895, 3174, 3178, 3181, 3187, 3432,\n", " 8525]\n", "\n", "\n", "\n", "assert len(titles) == 7\n", "\n", "assert len(titles['tarkington']) == 22\n", "assert len(titles['dickens']) == 44\n", "assert len(titles['nesbit']) == 10\n", "assert len(titles['doyle']) == 51\n", "assert len(titles['twain']) == 29\n", "assert len(titles['burton']) == 11\n", "assert len(titles['gaboriau']) == 10\n", "\n", "\n", "# https://www.gutenberg.org\n", "#url_base = \"http://gutenberg.pglaf.org/\"\n", "#url_base = \"http://gutenberg.readingroo.ms/\"\n", "url_base = \"http://www.gutenberg.myebook.bg/\"\n", "url_format = \"{url_base}{idstring}/{id}/{id}.txt\"\n", "\n", "fixes = {}\n", "fixes[1044] = url_base + \"1/0/4/1044/1044-0.txt\"\n", "fixes[5148] = url_base + \"5/1/4/5148/5148-0.txt\"\n", "fixes[4657] = \"https://archive.org/stream/personalnarrativ04657gut/pnpa110.txt\"\n", "fixes[1467] = \"https://archive.org/stream/somechristmassto01467gut/cdscs10p_djvu.txt\"\n", "\n", "# Make parent folder if not exists\n", "if not os.path.exists(data_folder):\n", " os.makedirs(data_folder)\n", "\n", "for author in titles:\n", " print(\"Downloading titles from {author}\".format(author=author))\n", " # Make author's folder if not exists\n", " author_folder = os.path.join(data_folder, author)\n", " if not os.path.exists(author_folder):\n", " os.makedirs(author_folder)\n", " # Download each title to this folder\n", " for bookid in titles[author]:\n", " if bookid in fixes:\n", " print(\" - Applying fix to book with id {id}\".format(id=bookid))\n", " url = fixes[bookid]\n", " else:\n", " print(\" - Getting book with id {id}\".format(id=bookid))\n", " idstring = \"/\".join([str(bookid)[i] for i in range(len(str(bookid))-1)])\n", " 
"            url = url_format.format(url_base=url_base, idstring=idstring, id=bookid)\n",
"        print(\" - \" + url)\n",
"        filename = os.path.join(author_folder, \"{id}.txt\".format(id=bookid))\n",
"        if os.path.exists(filename):\n",
"            print(\" - File already exists, skipping\")\n",
"        else:\n",
"            urllib.request.urlretrieve(url, filename)\n",
"            # Be polite to the mirror: wait five minutes between downloads\n",
"            sleep(60*5)\n",
"print(\"Download complete\")"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"def clean_book(document):\n",
"    # Strip the Project Gutenberg header and license footer,\n",
"    # keeping only the text between the START and END markers\n",
"    lines = document.split(\"\\n\")\n",
"    start = 0\n",
"    end = len(lines)\n",
"    for i in range(len(lines)):\n",
"        line = lines[i]\n",
"        if line.startswith(\"*** START OF THIS PROJECT GUTENBERG\"):\n",
"            start = i + 1\n",
"        elif line.startswith(\"*** END OF THIS PROJECT GUTENBERG\"):\n",
"            end = i - 1\n",
"    return \"\\n\".join(lines[start:end])"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"import numpy as np\n",
"\n",
"def load_books_data(folder=data_folder):\n",
"    documents = []\n",
"    authors = []\n",
"    subfolders = [subfolder for subfolder in os.listdir(folder)\n",
"                  if os.path.isdir(os.path.join(folder, subfolder))]\n",
"    for author_number, subfolder in enumerate(subfolders):\n",
"        full_subfolder_path = os.path.join(folder, subfolder)\n",
"        for document_name in os.listdir(full_subfolder_path):\n",
"            with open(os.path.join(full_subfolder_path, document_name)) as inf:\n",
"                documents.append(clean_book(inf.read()))\n",
"                authors.append(author_number)\n",
"    return documents, np.array(authors, dtype='int')"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"documents, classes = load_books_data(data_folder)"
] },
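{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check (a minimal sketch that only uses the `documents` and `classes` loaded above), confirm how many books were read and how they are spread across the authors." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Corpus size and per-author document counts\n",
"print(\"{} documents from {} authors\".format(len(documents), len(set(classes))))\n",
"print(np.bincount(classes))"
] },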
\"minority\",\n", " \"minus\", \"more\", \"most\", \"much\", \"must\", \"my\", \"myself\",\n", " \"near\", \"need\", \"neither\", \"nevertheless\", \"next\", \"nine\",\n", " \"ninth\", \"no\", \"nobody\", \"none\", \"nor\", \"nothing\",\n", " \"notwithstanding\", \"number\", \"numbers\", \"of\", \"off\", \"on\",\n", " \"once\", \"one\", \"onto\", \"opposite\", \"or\", \"other\", \"ought\",\n", " \"our\", \"ours\", \"ourselves\", \"out\", \"outside\", \"over\", \"part\",\n", " \"past\", \"pending\", \"per\", \"pertaining\", \"place\", \"plenty\",\n", " \"plethora\", \"plus\", \"quantities\", \"quantity\", \"quarter\",\n", " \"regarding\", \"remainder\", \"respecting\", \"rest\", \"round\",\n", " \"save\", \"saving\", \"second\", \"seven\", \"seventh\", \"several\",\n", " \"shall\", \"she\", \"should\", \"similar\", \"since\", \"six\", \"sixth\",\n", " \"so\", \"some\", \"somebody\", \"someone\", \"something\", \"spite\",\n", " \"such\", \"ten\", \"tenth\", \"than\", \"thanks\", \"that\", \"the\",\n", " \"their\", \"theirs\", \"them\", \"themselves\", \"then\", \"thence\",\n", " \"therefore\", \"these\", \"they\", \"third\", \"this\", \"those\",\n", "\"though\", \"three\", \"through\", \"throughout\", \"thru\", \"thus\",\n", "\"till\", \"time\", \"to\", \"tons\", \"top\", \"toward\", \"towards\",\n", "\"two\", \"under\", \"underneath\", \"unless\", \"unlike\", \"until\",\n", "\"unto\", \"up\", \"upon\", \"us\", \"used\", \"various\", \"versus\",\n", "\"via\", \"view\", \"wanting\", \"was\", \"we\", \"were\", \"what\",\n", "\"whatever\", \"when\", \"whenever\", \"where\", \"whereas\",\n", "\"wherever\", \"whether\", \"which\", \"whichever\", \"while\",\n", " \"whilst\", \"who\", \"whoever\", \"whole\", \"whom\", \"whomever\",\n", "\"whose\", \"will\", \"with\", \"within\", \"without\", \"would\", \"yet\",\n", "\"you\", \"your\", \"yours\", \"yourself\", \"yourselves\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "extractor = CountVectorizer(vocabulary=function_words)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "from sklearn.cross_validation import cross_val_score\n", "from sklearn.pipeline import Pipeline\n", "from sklearn import grid_search" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}\n", "svr = SVC()\n", "grid = grid_search.GridSearchCV(svr, parameters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipeline1 = Pipeline([('feature_extraction', extractor),\n", " ('clf', grid)\n", " ])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "scores = cross_val_score(pipeline1, documents, classes,\n", "scoring='f1')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(np.mean(scores))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline([('feature_extraction',\n", "CountVectorizer(analyzer='char', ngram_range=(3, 3))),\n", "('classifier', grid)\n", "])\n", "scores = cross_val_score(pipeline, documents, classes,\n", "scoring='f1')\n", "print(\"Score: {:.3f}\".format(np.mean(scores)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"enron_data_folder = os.path.join(os.path.expanduser(\"~\"), \"Data\",\n", "\"enron_mail_20110402\", \"maildir\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from email.parser import Parser\n", "p = Parser()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.utils import check_random_state" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_enron_corpus(num_authors=10, data_folder=data_folder,\n", " min_docs_author=10, max_docs_author=100,\n", " random_state=None):\n", " random_state = check_random_state(random_state)\n", " email_addresses = sorted(os.listdir(data_folder))\n", " random_state.shuffle(email_addresses)\n", " documents = []\n", " classes = []\n", " author_num = 0\n", " authors = {}\n", " for user in email_addresses:\n", " users_email_folder = os.path.join(data_folder, user)\n", " mail_folders = [os.path.join(users_email_folder, subfolder)\n", " for subfolder in os.listdir(users_email_folder)\n", " if \"sent\" in subfolder]\n", " try:\n", " authored_emails = [open(os.path.join(mail_folder, email_filename), encoding='cp1252').read()\n", " for mail_folder in mail_folders\n", " for email_filename in os.listdir(mail_folder)]\n", " except IsADirectoryError:\n", " continue\n", " if len(authored_emails) < min_docs_author:\n", " continue\n", " if len(authored_emails) > max_docs_author:\n", " authored_emails = authored_emails[:max_docs_author]\n", " contents = [p.parsestr(email)._payload for email in authored_emails]\n", " documents.extend(contents)\n", " classes.extend([author_num] * len(authored_emails))\n", " authors[user] = author_num\n", " author_num += 1\n", " if author_num >= num_authors or author_num >= len(email_addresses):\n", " break\n", " return documents, np.array(classes), authors" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "documents, classes, authors = get_enron_corpus(data_folder=enron_data_folder, random_state=14)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "documents[100]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import quotequail" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def remove_replies(email_contents):\n", " r = quotequail.unwrap(email_contents)\n", " if r is None:\n", " return email_contents\n", " if 'text_top' in r:\n", " return r['text_top']\n", " elif 'text' in r:\n", " return r['text']\n", " return email_contents" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "documents = [remove_replies(document) for document in documents]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "scores = cross_val_score(pipeline, documents, classes, scoring='f1')\n", "print(\"Score: {:.3f}\".format(np.mean(scores)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.cross_validation import train_test_split\n", "training_documents, testing_documents, y_train, y_test = train_test_split(documents, classes, random_state=14)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipeline.fit(training_documents, y_train)\n", "y_pred = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"print(pipeline.named_steps['classifier'].best_params_)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"from sklearn.metrics import confusion_matrix\n",
"cm = confusion_matrix(y_test, y_pred)\n",
"# Normalise each row so it shows proportions for that actual author\n",
"cm = cm / cm.astype(float).sum(axis=1)[:, np.newaxis]\n",
"sorted_authors = sorted(authors.keys(), key=lambda x: authors[x])"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"%matplotlib inline\n",
"from matplotlib import pyplot as plt\n",
"plt.figure(figsize=(30, 30))\n",
"plt.imshow(cm, cmap='Blues')\n",
"tick_marks = np.arange(len(sorted_authors))\n",
"plt.xticks(tick_marks, sorted_authors, rotation=90)\n",
"plt.yticks(tick_marks, sorted_authors)\n",
"plt.ylabel('Actual')\n",
"plt.xlabel('Predicted')\n",
"plt.show()"
] }
],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "colors": { "hover_highlight": "#DAA520", "navigate_num": "#000000", "navigate_text": "#333333", "running_highlight": "#FF0000", "selected_highlight": "#FFD700", "sidebar_border": "#EEEEEE", "wrapper_background": "#FFFFFF" }, "moveMenuLeft": true, "nav_menu": { "height": "36px", "width": "253px" }, "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false, "widenNotebook": false } },
"nbformat": 4,
"nbformat_minor": 2
}