{ "metadata": { "name": "", "signature": "sha256:9d78c8eceed7efe108eaba36a2db0291335d2fd0a1ee5c0293367b55e04d68a9" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The Bible" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip\n", "# if you need unzip.... !sudo apt-get install -y unzip\n", "!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip\n", "!unzip AV1611text.zip" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\r\n", " Dload Upload Total Spent Left Speed\r\n", "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 1430k 100 1430k 0 0 1588k 0 --:--:-- --:--:-- --:--:-- 1587k\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Archive: AV1611text.zip\r\n", " creating: AV1611text/\r\n", " inflating: AV1611text/.DS_Store \r\n", " creating: __MACOSX/\r\n", " creating: __MACOSX/AV1611text/\r\n", " inflating: __MACOSX/AV1611text/._.DS_Store \r\n", " inflating: AV1611text/1Chronicles.txt \r\n", " inflating: __MACOSX/AV1611text/._1Chronicles.txt \r\n", " inflating: AV1611text/1Corinthians.txt \r\n", " inflating: __MACOSX/AV1611text/._1Corinthians.txt \r\n", " inflating: AV1611text/1John.txt \r\n", " inflating: __MACOSX/AV1611text/._1John.txt \r\n", " inflating: AV1611text/1Kings.txt \r\n", " inflating: __MACOSX/AV1611text/._1Kings.txt \r\n", " inflating: AV1611text/1Peter.txt \r\n", " 
inflating: __MACOSX/AV1611text/._1Peter.txt \r\n", " inflating: AV1611text/1Samuel.txt \r\n", " inflating: __MACOSX/AV1611text/._1Samuel.txt \r\n", " inflating: AV1611text/1Thessalonians.txt \r\n", " inflating: __MACOSX/AV1611text/._1Thessalonians.txt \r\n", " inflating: AV1611text/1Timothy.txt \r\n", " inflating: __MACOSX/AV1611text/._1Timothy.txt \r\n", " inflating: AV1611text/2Chronicles.txt \r\n", " inflating: __MACOSX/AV1611text/._2Chronicles.txt \r\n", " inflating: AV1611text/2Corinthians.txt \r\n", " inflating: __MACOSX/AV1611text/._2Corinthians.txt \r\n", " inflating: AV1611text/2John.txt \r\n", " inflating: __MACOSX/AV1611text/._2John.txt \r\n", " inflating: AV1611text/2Kings.txt \r\n", " inflating: __MACOSX/AV1611text/._2Kings.txt \r\n", " inflating: AV1611text/2Peter.txt \r\n", " inflating: __MACOSX/AV1611text/._2Peter.txt \r\n", " inflating: AV1611text/2Samuel.txt \r\n", " inflating: __MACOSX/AV1611text/._2Samuel.txt \r\n", " inflating: AV1611text/2Thessalonians.txt \r\n", " inflating: __MACOSX/AV1611text/._2Thessalonians.txt \r\n", " inflating: AV1611text/2Timothy.txt \r\n", " inflating: __MACOSX/AV1611text/._2Timothy.txt \r\n", " inflating: AV1611text/3John.txt \r\n", " inflating: __MACOSX/AV1611text/._3John.txt \r\n", " inflating: AV1611text/Acts.txt \r\n", " inflating: __MACOSX/AV1611text/._Acts.txt \r\n", " inflating: AV1611text/Amos.txt \r\n", " inflating: __MACOSX/AV1611text/._Amos.txt \r\n", " inflating: AV1611text/Colossians.txt \r\n", " inflating: __MACOSX/AV1611text/._Colossians.txt \r\n", " inflating: AV1611text/Daniel.txt \r\n", " inflating: __MACOSX/AV1611text/._Daniel.txt \r\n", " inflating: AV1611text/Dedicatory.txt \r\n", " inflating: __MACOSX/AV1611text/._Dedicatory.txt \r\n", " inflating: AV1611text/Deuteronomy.txt \r\n", " inflating: __MACOSX/AV1611text/._Deuteronomy.txt \r\n", " inflating: AV1611text/Ecclesiastes.txt \r\n", " inflating: __MACOSX/AV1611text/._Ecclesiastes.txt \r\n", " inflating: AV1611text/Ephesians.txt \r\n", " 
inflating: __MACOSX/AV1611text/._Ephesians.txt \r\n", " inflating: AV1611text/Esther.txt \r\n", " inflating: __MACOSX/AV1611text/._Esther.txt \r\n", " inflating: AV1611text/Exodus.txt \r\n", " inflating: __MACOSX/AV1611text/._Exodus.txt \r\n", " inflating: AV1611text/Ezekiel.txt \r\n", " inflating: __MACOSX/AV1611text/._Ezekiel.txt \r\n", " inflating: AV1611text/Ezra.txt \r\n", " inflating: __MACOSX/AV1611text/._Ezra.txt \r\n", " inflating: AV1611text/Galatians.txt \r\n", " inflating: __MACOSX/AV1611text/._Galatians.txt \r\n", " inflating: AV1611text/Genesis.txt \r\n", " inflating: __MACOSX/AV1611text/._Genesis.txt \r\n", " inflating: AV1611text/Habakkuk.txt \r\n", " inflating: __MACOSX/AV1611text/._Habakkuk.txt \r\n", " inflating: AV1611text/Haggai.txt \r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " inflating: __MACOSX/AV1611text/._Haggai.txt \r\n", " inflating: AV1611text/Hebrews.txt \r\n", " inflating: __MACOSX/AV1611text/._Hebrews.txt \r\n", " inflating: AV1611text/Hosea.txt \r\n", " inflating: __MACOSX/AV1611text/._Hosea.txt \r\n", " inflating: AV1611text/Isaiah.txt \r\n", " inflating: __MACOSX/AV1611text/._Isaiah.txt \r\n", " inflating: AV1611text/James.txt \r\n", " inflating: __MACOSX/AV1611text/._James.txt \r\n", " inflating: AV1611text/Jeremiah.txt \r\n", " inflating: __MACOSX/AV1611text/._Jeremiah.txt \r\n", " inflating: AV1611text/Job.txt \r\n", " inflating: __MACOSX/AV1611text/._Job.txt \r\n", " inflating: AV1611text/Joel.txt \r\n", " inflating: __MACOSX/AV1611text/._Joel.txt \r\n", " inflating: AV1611text/John.txt \r\n", " inflating: __MACOSX/AV1611text/._John.txt \r\n", " inflating: AV1611text/Jonah.txt \r\n", " inflating: __MACOSX/AV1611text/._Jonah.txt \r\n", " inflating: AV1611text/Joshua.txt \r\n", " inflating: __MACOSX/AV1611text/._Joshua.txt \r\n", " inflating: AV1611text/Jude.txt \r\n", " inflating: __MACOSX/AV1611text/._Jude.txt \r\n", " inflating: AV1611text/Judges.txt \r\n", " inflating: __MACOSX/AV1611text/._Judges.txt 
\r\n", " inflating: AV1611text/Lamentations.txt \r\n", " inflating: __MACOSX/AV1611text/._Lamentations.txt \r\n", " inflating: AV1611text/Leviticus.txt \r\n", " inflating: __MACOSX/AV1611text/._Leviticus.txt \r\n", " inflating: AV1611text/Luke.txt \r\n", " inflating: __MACOSX/AV1611text/._Luke.txt \r\n", " inflating: AV1611text/Malachi.txt \r\n", " inflating: __MACOSX/AV1611text/._Malachi.txt \r\n", " inflating: AV1611text/Mark.txt \r\n", " inflating: __MACOSX/AV1611text/._Mark.txt \r\n", " inflating: AV1611text/Matthew.txt \r\n", " inflating: __MACOSX/AV1611text/._Matthew.txt \r\n", " inflating: AV1611text/Micah.txt \r\n", " inflating: __MACOSX/AV1611text/._Micah.txt \r\n", " inflating: AV1611text/Nahum.txt \r\n", " inflating: __MACOSX/AV1611text/._Nahum.txt \r\n", " inflating: AV1611text/Nehemiah.txt \r\n", " inflating: __MACOSX/AV1611text/._Nehemiah.txt \r\n", " inflating: AV1611text/Numbers.txt \r\n", " inflating: __MACOSX/AV1611text/._Numbers.txt \r\n", " inflating: AV1611text/Obadiah.txt \r\n", " inflating: __MACOSX/AV1611text/._Obadiah.txt \r\n", " inflating: AV1611text/Philemon.txt \r\n", " inflating: __MACOSX/AV1611text/._Philemon.txt \r\n", " inflating: AV1611text/Philippians.txt \r\n", " inflating: __MACOSX/AV1611text/._Philippians.txt \r\n", " inflating: AV1611text/Preface.txt \r\n", " inflating: __MACOSX/AV1611text/._Preface.txt \r\n", " inflating: AV1611text/Preface_w_footnotes.txt \r\n", " inflating: __MACOSX/AV1611text/._Preface_w_footnotes.txt \r\n", " inflating: AV1611text/Proverbs.txt \r\n", " inflating: __MACOSX/AV1611text/._Proverbs.txt \r\n", " inflating: AV1611text/Psalms.txt \r\n", " inflating: __MACOSX/AV1611text/._Psalms.txt \r\n", " inflating: AV1611text/Revelation.txt \r\n", " inflating: __MACOSX/AV1611text/._Revelation.txt \r\n", " inflating: AV1611text/Romans.txt \r\n", " inflating: __MACOSX/AV1611text/._Romans.txt \r\n", " inflating: AV1611text/Ruth.txt \r\n", " inflating: __MACOSX/AV1611text/._Ruth.txt \r\n", " inflating: 
AV1611text/SongofSolomon.txt \r\n", " inflating: __MACOSX/AV1611text/._SongofSolomon.txt \r\n", " inflating: AV1611text/Titus.txt \r\n", " inflating: __MACOSX/AV1611text/._Titus.txt \r\n", " inflating: AV1611text/Zechariah.txt \r\n", " inflating: __MACOSX/AV1611text/._Zechariah.txt \r\n", " inflating: AV1611text/Zephaniah.txt \r\n", " inflating: __MACOSX/AV1611text/._Zephaniah.txt \r\n", " inflating: __MACOSX/._AV1611text \r\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "!ls -l AV1611text/" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "total 9016\r\n", "-rwxr-xr-x 1 soma staff 116614 Nov 9 2003 \u001b[31m1Chronicles.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 51441 Nov 13 2003 \u001b[31m1Corinthians.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 13289 Nov 14 2003 \u001b[31m1John.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 132016 Nov 9 2003 \u001b[31m1Kings.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 14081 Nov 14 2003 \u001b[31m1Peter.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 134405 Nov 9 2003 \u001b[31m1Samuel.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 10247 Nov 13 2003 \u001b[31m1Thessalonians.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 13419 Nov 14 2003 \u001b[31m1Timothy.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 144096 Nov 9 2003 \u001b[31m2Chronicles.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 33679 Nov 13 2003 \u001b[31m2Corinthians.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 1641 Nov 14 2003 \u001b[31m2John.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 125212 Nov 9 2003 \u001b[31m2Kings.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 9063 Nov 14 2003 \u001b[31m2Peter.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 110670 Nov 9 2003 \u001b[31m2Samuel.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 5812 Nov 13 2003 \u001b[31m2Thessalonians.txt\u001b[m\u001b[m\r\n", 
"-rwxr-xr-x 1 soma staff 9842 Nov 14 2003 \u001b[31m2Timothy.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 1688 Nov 14 2003 \u001b[31m3John.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 134948 Nov 13 2003 \u001b[31mActs.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 22719 Nov 13 2003 \u001b[31mAmos.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 11333 Nov 13 2003 \u001b[31mColossians.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 64065 Nov 13 2003 \u001b[31mDaniel.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 5751 Feb 24 2004 \u001b[31mDedicatory.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 152225 Nov 5 2003 \u001b[31mDeuteronomy.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 29934 Nov 13 2003 \u001b[31mEcclesiastes.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 17136 Nov 13 2003 \u001b[31mEphesians.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 31308 Nov 9 2003 \u001b[31mEsther.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 176725 Nov 5 2003 \u001b[31mExodus.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 213314 Nov 13 2003 \u001b[31mEzekiel.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 42038 Nov 9 2003 \u001b[31mEzra.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 17067 Nov 13 2003 \u001b[31mGalatians.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 205748 Nov 5 2003 \u001b[31mGenesis.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 8304 Nov 13 2003 \u001b[31mHabakkuk.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 5987 Nov 13 2003 \u001b[31mHaggai.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 38903 Nov 14 2003 \u001b[31mHebrews.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 28373 Nov 13 2003 \u001b[31mHosea.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 202179 Nov 13 2003 \u001b[31mIsaiah.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 12711 Nov 14 2003 \u001b[31mJames.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 231838 Nov 13 2003 \u001b[31mJeremiah.txt\u001b[m\u001b[m\r\n", 
"-rwxr-xr-x 1 soma staff 100264 Nov 10 2003 \u001b[31mJob.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 11124 Nov 13 2003 \u001b[31mJoel.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 102311 Nov 13 2003 \u001b[31mJohn.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 6914 Nov 13 2003 \u001b[31mJonah.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 104045 Nov 5 2003 \u001b[31mJoshua.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 3672 Nov 14 2003 \u001b[31mJude.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 102418 Nov 5 2003 \u001b[31mJudges.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 19010 Nov 13 2003 \u001b[31mLamentations.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 132373 Nov 5 2003 \u001b[31mLeviticus.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 140547 Nov 13 2003 \u001b[31mLuke.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 9571 Nov 13 2003 \u001b[31mMalachi.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 82513 Nov 13 2003 \u001b[31mMark.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 129900 Nov 13 2003 \u001b[31mMatthew.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 16999 Nov 13 2003 \u001b[31mMicah.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 7217 Nov 13 2003 \u001b[31mNahum.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 59451 Nov 9 2003 \u001b[31mNehemiah.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 183087 Nov 5 2003 \u001b[31mNumbers.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 3727 Nov 13 2003 \u001b[31mObadiah.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 2533 Nov 14 2003 \u001b[31mPhilemon.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 12217 Nov 13 2003 \u001b[31mPhilippians.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 63160 Dec 1 2003 \u001b[31mPreface.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 67128 Nov 23 2003 \u001b[31mPreface_w_footnotes.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 85466 Nov 10 2003 \u001b[31mProverbs.txt\u001b[m\u001b[m\r\n", 
"-rwxr-xr-x 1 soma staff 243509 Nov 13 2003 \u001b[31mPsalms.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 64205 Nov 14 2003 \u001b[31mRevelation.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 52602 Nov 13 2003 \u001b[31mRomans.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 13561 Nov 6 2003 \u001b[31mRuth.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 14584 Nov 13 2003 \u001b[31mSongofSolomon.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 5512 Nov 14 2003 \u001b[31mTitus.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 34273 Nov 13 2003 \u001b[31mZechariah.txt\u001b[m\u001b[m\r\n", "-rwxr-xr-x 1 soma staff 8841 Nov 13 2003 \u001b[31mZephaniah.txt\u001b[m\u001b[m\r\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "import glob\n", "import pandas as pd\n", "\n", "paths = glob.glob(\"AV1611text/*\")\n", "\n", "books = []\n", "for path in paths:\n", " contents = open(path).read()\n", " # For fear of unicode\n", " contents = contents.decode(\"ascii\",\"ignore\")\n", " contents = contents.lower()\n", " \n", " book = {}\n", " book['contents'] = contents\n", " filename = path[11:]\n", " book['name'] = filename[:-4]\n", " books.append(book)\n", "\n", "books_df = pd.DataFrame(books)\n", "books_df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contentsname
0 the first book of the chronicles\\r\\n\\r\\nchapte... 1Chronicles
1 the first epistle of paul the apostle to the c... 1Corinthians
2 the first general epistle of john\\r\\n\\r\\nchapt... 1John
3 the first book of the kings\\r\\n\\r\\ncommonly ca... 1Kings
4 the first epistle general of peter\\r\\n\\r\\nchap... 1Peter
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": [ " contents name\n", "0 the first book of the chronicles\\r\\n\\r\\nchapte... 1Chronicles\n", "1 the first epistle of paul the apostle to the c... 1Corinthians\n", "2 the first general epistle of john\\r\\n\\r\\nchapt... 1John\n", "3 the first book of the kings\\r\\n\\r\\ncommonly ca... 1Kings\n", "4 the first epistle general of peter\\r\\n\\r\\nchap... 1Peter" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# You'll need to change new_stopwords to 'english' if you haven't\n", "# run the code down below about nltk and creating a new stopwords list\n", "vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "X = vectorizer.fit_transform(books_df['contents'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.cluster import KMeans\n", "\n", "number_of_clusters = 5\n", "km = KMeans(n_clusters=number_of_clusters)\n", "km.fit(X)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 49, "text": [ "KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=10,\n", " n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,\n", " verbose=0)" ] } ], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "print(\"Top terms per cluster:\")\n", "order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", "terms = vectorizer.get_feature_names()\n", "for i in range(number_of_clusters):\n", " print(\"Cluster %d:\" % i),\n", " for ind in order_centroids[i, :10]:\n", " print(' %s' % terms[ind]),\n", " print ''" ], "language": "python", "metadata": {}, 
"outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Top terms per cluster:\n", "Cluster 0: lord hath upon god like man day hast thine wicked \n", "Cluster 1: christ god jesus lord things also us faith man hath \n", "Cluster 2: lord king said israel god upon people house son land \n", "Cluster 3: said jesus disciples man came saying god went peter lord \n", "Cluster 4: god man faith things us st good hath christ jesus \n" ] } ], "prompt_number": 50 }, { "cell_type": "code", "collapsed": false, "input": [ "additional_stopwords = ['shall', 'ye', 'thee', 'thou', 'thy', 'unto']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "import nltk\n", "\n", "english_stopwords = nltk.corpus.stopwords.words('english')\n", "new_stopwords = additional_stopwords + english_stopwords" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 44 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Congressional Record" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# You should already have the data, so you can skip pulling/unzipping it\n", "# Data is from http://www.cs.cornell.edu/home/llee/data/convote.html\n", "#!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz\n", "#!tar -zxvf convote_v1.1.tar.gz" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "import re\n", "import glob\n", "\n", "paths = glob.glob(\"convote_v1.1/data_stage_one/development_set/*\")\n", "speeches = []\n", "for path in paths:\n", " speech = {}\n", " filename = path[-26:]\n", " speech['filename'] = filename\n", " speech['bill_no'] = filename[:3]\n", " speech['speaker_no'] = filename[4:10]\n", " speech['bill_vote'] = filename[-5]\n", " speech['party'] = filename[-7]\n", " \n", " # Open the file\n", " speech_file = open(path, 'r')\n", " # Read the stuff out of it\n", " 
speech['contents'] = speech_file.read()\n", "\n", " cleaned_contents = re.sub(r\"[^ \\w]\",'', speech['contents'])\n", " cleaned_contents = re.sub(r\" +\",' ', cleaned_contents)\n", " cleaned_contents = cleaned_contents.strip()\n", " words = cleaned_contents.split(' ')\n", " speech['word_count'] = len(words)\n", " \n", " speeches.append(speech)\n", "speeches[:5]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 54, "text": [ "[{'bill_no': '052',\n", " 'bill_vote': 'N',\n", " 'contents': \"mr. chairman , i thank the gentlewoman for yielding me this time . \\nmy good colleague from california raised the exact and critical point . \\nthe question is , what happens during those 45 days ? \\nwe will need to support elections . \\nthere is not a single member of this house who has not supported some form of general election , a special election , to replace the members at some point . \\nbut during that 45 days , what happens ? \\nthe chair of the constitution subcommittee says this is what happens : martial law . \\nwe do not know who would fill the vacancy of the presidency , but we do know that the succession act most likely suggests it would be an unelected person . \\nthe sponsors of the bill before us today insist , and i think rightfully so , on the importance of elections . \\nbut to then say that during a 45-day period we would have none of the checks and balances so fundamental to our constitution , none of the separation of powers , and that the presidency would be filled by an unelected member of the cabinet who not a single member of this country , not a single citizen , voted to fill that position , and that that person would have no checks and balances from congress for a period of 45 days i find extraordinary . \\ni find it inconsistent . \\ni find it illogical , and , frankly , i find it dangerous . 
\\nthe gentleman from wisconsin refused earlier to yield time , but i was going to ask him , if virginia has those elections in a shorter time period , they should be commended for that . \\nso now we have a situation in the congress where the virginia delegation has sent their members here , but many other states do not have members here . \\ndo they at that point elect a speaker of the house in the absence of other members ? \\nand then three more states elect their representatives , temporary replacements , or full replacements at that point . \\nthey come in . \\ndo they elect a new speaker ? \\nand if that happens , who becomes the president under the succession act ? \\nthis bill does not address that question . \\nthis bill responds to real threats with fantasies . \\nit responds with the fantasy , first of all , that a lot of people will still survive ; but we have no guarantee of that . \\nit responds with the fantasy that those who do survive will do the right thing . \\nwe are here having this debate , we have debates every day , because people differ on what the right thing is to do . \\ni have been in very traumatic situations with people in severe car wrecks and mountain climbing accidents . \\nmy experience has not been that crisis imbues universal sagacity and fairness . \\nit has not been that . \\npeople respond in extraordinary ways , and we must preserve an institution that has the deliberative body and the checks and balances to meet those challenges . \\nmany of our states are going increasingly to mail-in ballots . \\nwe in this body were effectively disabled by an anthrax attack not long after september 11 . \\ni would ask my dear friends , will you conduct this election in 45 days if there is anthrax in the mail and still preserve the franchise of the american people ? \\nhow will you do that ? \\nyou have no answer to that question . 
\\ni find it extraordinary , frankly , that while saying you do not want to amend the constitution , we began this very congress by amending the constitution through the rule , by undermining the principle that a quorum is 50 percent of the body and instead saying it is however many people survive . \\nand if that rule applies , who will designate it , who will implement it ? \\nthe speaker , or the speaker 's designee ? \\nagain , not an elected person , as you say is so critical and i believe is critical , but a temporary appointee , frankly , who not a single other member of this body knows who they are . \\nso we not only have an unelected person , we have an unknown person who will convene this body , and who , by the way , could conceivably convene it for their own election to then become the president of the united states under the succession act . \\nyou have refused steadfastly to debate this real issue broadly . \\nyou had a mock debate in the committee on the judiciary in which the distinguished chairman presented my bill without allowing me the courtesy or dignity to defend it myself . \\nand on that , you proudly say you defend democracy . \\nsir , i think you dissemble in that regard . \\nhere is the fundamental question for us , my friends , and it is this : the american people are watching television and an announcement comes on and says the congress has been destroyed in a nuclear attack , the president and vice president are killed and the supreme court is dead and thousands of our citizens in this town are . \\nwhat happens next ? \\nunder your bill , 45 days of chaos . \\napparently , according to the committee on the judiciary subcommittee on the constitution chairman , 45 days of marshal law , rule of this country by an unelected president with no checks and balances . 
\\nor an alternative , an alternative which says quite simply that the people have entrusted the representatives they send here to make profound decisions , war , taxation , a host of other things , and those representatives would have the power under the bill of the gentleman from california ( mr. rohrabacher ) xz4003430 bill or mine to designate temporary successors , temporary , only until we can have a real election . \\nthe american people , in one scenario , are told we do not know who is going to run the country , we have no representatives ; where in another you will have temporary representatives carrying your interests to this great body while we deliberate and have real elections . \\nthat is the choice . \\nyou are making the wrong choice today if you think you have solved this problem . \\n\",\n", " 'filename': '052_400011_0327014_DON.txt',\n", " 'party': 'D',\n", " 'speaker_no': '400011',\n", " 'word_count': 974},\n", " {'bill_no': '052',\n", " 'bill_vote': 'N',\n", " 'contents': \"mr. chairman , i want to thank my good friend from california ( mr. rohrabacher ) xz4003430 . \\ni will always remember that day , as we all will . \\nhis point is well taken . \\ni understand there is good intent behind the bill before us today and the amendment , but it is not enough . \\nit simply is not . \\nit leaves our country vulnerable for 45 days and that is too long . \\nthe distinguished chairman of the committee on the judiciary made some comments recently that suggested that somehow terrorists would oppose this bill and by some implication would favor the bill the gentleman from california ( mr. rohrabacher ) xz4003430 and i have put forward because it seems to support their autocratic views of government . \\nnothing could be further from the truth . 
\\nin fact , what our bill would do is tell the terrorists , you could come on a single day and set off a nuclear weapon in this town and kill every single member of us ; and though we would be missed , the very next day the congress would be up and functioning with every single state , every single district having full representation by statesmen and stateswomen at a time of national crisis . \\nthat is what the gentleman from california ( mr. rohrabacher ) xz4003430 and i are trying to do . \\nwe are trying to tell the terrorists , you can kill all of us as individuals , but you will not defeat this institution . \\nyou will not defeat the principle of representation . \\nyou will not defeat the principles of checks and balances . \\nyou will not impose martial law . \\nhere is the irony . \\nif terrorists hit us today when we finally vote on this , let us suppose a few democrats do not make it over here . \\nyou are leaving this country vulnerable to change in power . \\nif the terrorists were to strike your conference retreat where the president speaks to the republican house and senate members and kill hundreds of house and senate members on the republican side , the democrats at that point claim the majority . \\nthe democrats at that point elect a speaker of the house . \\ni am a democrat , for goodness sakes ; but that is not the way to leave our country vulnerable . \\nyou are leaving your own party , you are leaving the will of the people through their elections vulnerable . \\nif we have temporary replacements , you immediately reconstitute the house ; you immediately ensure representation ; you assure that you maintain the balance of political power ; and you do it in an orderly , structured way with no chaos , in a way that is constitutionally valid by definition . \\nwhat you have proposed is not necessarily constitutionally valid . \\nit leaves the terrorists able to change our system of government . 
\\nit depends on a fantasy immediate or quick election . \\nit does not allow really qualified people necessarily to get here and act in time . \\nthere are so many things you have left undone . \\nyou are going to try to say that at the start of this year we have solved this problem ; let us go home . \\nyou have not solved the problem , and it is a doggone disgrace , and it is a danger to this country . \\nthe other day a gentleman testified before the committee on the budget and said this : `` the lack of preparation for continuity , for true continuity invites attack. '' you are inviting attack . \\nnot preventing attack . \\n\",\n", " 'filename': '052_400011_0327025_DON.txt',\n", " 'party': 'D',\n", " 'speaker_no': '400011',\n", " 'word_count': 556},\n", " {'bill_no': '052',\n", " 'bill_vote': 'N',\n", " 'contents': 'mr. chairman , i rise to make two fundamental points before we proceed to vote on this . \\nthe two points are these : this resolution does not solve the real problem and it may create more problems than it purports to solve , and we have to understand that . \\nit does not solve the problem for this reason : by leaving us without a congress for 45 days , we essentially impose the opportunity for the executive branch to exert marshal law , and that is not what the framers of this country had in mind . \\nthis bill , if we do not provide some mechanism for prompt replacement other than this bill , will leave this country governed by an unelected executive , a cabinet member most likely who not a single american elected to that office . \\nfurthermore , it has a host of problems . \\nit does not address the possibility that one delegation will elect its representatives more promptly than another . \\nthey will come to this body , choose one of its members as speaker . \\nthat person could move on to become the president . \\nthen another delegation comes in , et cetera . 
\\nyou are essentially leaving this country without a house of representatives , without checks and balances , without separation of powers , for at least 45 days , assuming an election can be held in 45 days and assuming that the terrorists through an anthrax attack , like they subjected this very capitol to , will not somehow undermine that ability . \\nthis is reality . \\nwe have seen the reality here . \\nwe saw those airplanes hit the buildings , we saw the anthrax , and yet we are not truly acting to solve this . \\nmr. chairman , i yield to my distinguished friend , the gentleman from california ( mr. rohrabacher ) xz4003430 . \\n',\n", " 'filename': '052_400011_0327044_DON.txt',\n", " 'party': 'D',\n", " 'speaker_no': '400011',\n", " 'word_count': 282},\n", " {'bill_no': '052',\n", " 'bill_vote': 'N',\n", " 'contents': 'mr. chairman , reclaiming my time , let me make two final points : one , the majority party must understand this : if you are at a republican conference retreat and terrorists should strike you and kill the president and vice president and significant numbers of your side of the aisle , the democrats under your proposed law will obtain the majority , will elect a speaker of the house , and that person will then become the president of the united states of america . \\nyou are leaving this country vulnerable to that . \\nyou must not do it . \\nyou must not . \\nthis matter must be taken seriously . \\nit deserves full debate . \\nwhether it is the proposal of the gentleman from california ( mr. rohrabacher ) xz4003430 and mine or others , we should commit to having this full house seriously consider this . \\nif we do not and we are not fortunate , history will not look kindly upon the jeopardy in which we have left this great nation . 
\\nvote no on this bill and insist on true debate on true continuity of congress in a responsible way that protects the balance of power , assures real succession to the presidency , and , most importantly , assures that your constituents will have representation at a time when our nation may well go to nuclear war , institute a draft , appropriate trillions of dollars , suspend habeas corpus and impose marshal law . \\nyou do not want that . \\nbut if you stop at this bill , you leave this nation vulnerable . \\nmr. chairman , if there is no one to speak in opposition , i ask unanimous consent to withdraw my preferential motion . \\n',\n", " 'filename': '052_400011_0327046_DON.txt',\n", " 'party': 'D',\n", " 'speaker_no': '400011',\n", " 'word_count': 261},\n", " {'bill_no': '052',\n", " 'bill_vote': 'N',\n", " 'contents': \"mr. chairman , i thank my distinguished colleague , and i appreciate his leadership on this issue . \\nthe gentleman from california ( mr. rohrabacher ) xz4003430 spoke eloquently about the need for the rohrabacher/baird amendment ; and i would like to address it briefly , if i may . \\nmadison is quoted on this topic , but let me quote madison from federalist 47 . \\nhe said : `` the accumulation of all powers , legislative , executive , and judiciary in the same hands , whether of one , a few , or many , and whether hereditary , self-appointed , or elected , may justly be pronounced the very definition of tyranny. '' now , i would like , if i may , to ask my colleagues , before we pass this appropriations bill with legislative language in it alleging to maintain continuity , to maybe address a couple of questions , before my colleagues vote on this , and i will yield time . \\nnot for a filibuster , but just to address some questions . \\nhow will we , given madison 's concern , maintain checks and balances during the 49-day period until we have the special elections ? 
\\ni would be happy to yield 30 seconds to anyone who plans to vote for this bill to address that question . \\n\",\n", " 'filename': '052_400011_1479036_DON.txt',\n", " 'party': 'D',\n", " 'speaker_no': '400011',\n", " 'word_count': 189}]" ] } ], "prompt_number": 54 }, { "cell_type": "code", "collapsed": false, "input": [ "speeches_df = pd.DataFrame(speeches)\n", "speeches_df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bill_nobill_votecontentsfilenamepartyspeaker_noword_count
0 052 N mr. chairman , i thank the gentlewoman for yie... 052_400011_0327014_DON.txt D 400011 974
1 052 N mr. chairman , i want to thank my good friend ... 052_400011_0327025_DON.txt D 400011 556
2 052 N mr. chairman , i rise to make two fundamental ... 052_400011_0327044_DON.txt D 400011 282
3 052 N mr. chairman , reclaiming my time , let me mak... 052_400011_0327046_DON.txt D 400011 261
4 052 N mr. chairman , i thank my distinguished collea... 052_400011_1479036_DON.txt D 400011 189
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 57, "text": [ " bill_no bill_vote contents \\\n", "0 052 N mr. chairman , i thank the gentlewoman for yie... \n", "1 052 N mr. chairman , i want to thank my good friend ... \n", "2 052 N mr. chairman , i rise to make two fundamental ... \n", "3 052 N mr. chairman , reclaiming my time , let me mak... \n", "4 052 N mr. chairman , i thank my distinguished collea... \n", "\n", " filename party speaker_no word_count \n", "0 052_400011_0327014_DON.txt D 400011 974 \n", "1 052_400011_0327025_DON.txt D 400011 556 \n", "2 052_400011_0327044_DON.txt D 400011 282 \n", "3 052_400011_0327046_DON.txt D 400011 261 \n", "4 052_400011_1479036_DON.txt D 400011 189 " ] } ], "prompt_number": 57 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')\n", "longer_speeches = speeches_df[speeches_df[\"word_count\"] > 92]\n", "X = vectorizer.fit_transform(longer_speeches['contents'])\n", "\n", "number_of_clusters = 7\n", "km = KMeans(n_clusters=number_of_clusters)\n", "km.fit(X)\n", "\n", "print(\"Top terms per cluster:\")\n", "order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", "terms = vectorizer.get_feature_names()\n", "for i in range(number_of_clusters):\n", " print(\"Cluster %d:\" % i),\n", " for ind in order_centroids[i, :8]:\n", " print(' %s' % terms[ind]),\n", " print ''" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Top terms per cluster:\n", "Cluster 0: start head children program school programs amendment parents \n", "Cluster 1: china trade speaker cafta chinese currency jobs madam \n", "Cluster 2: treatment blending water sewage epa policy wastewater amendment \n", "Cluster 3: religious faith organizations based start head civil rights \n", "Cluster 4: veterans care va billion health budget money war \n", "Cluster 5: mr amendment chairman time gentleman house horses 
speaker \n", "Cluster 6: frivolous lawsuits court rule courts sanctions 11 lawsuit \n" ] } ], "prompt_number": 74 }, { "cell_type": "code", "collapsed": false, "input": [ "longer_speeches[\"k-means label\"] = km.labels_\n", "longer_speeches.head()\n", "epa_speeches = longer_speeches[longer_speeches[\"k-means label\"] == 2]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 84 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')\n", "X = vectorizer.fit_transform(epa_speeches['contents'])\n", "\n", "number_of_clusters = 5\n", "km = KMeans(n_clusters=number_of_clusters)\n", "km.fit(X)\n", "\n", "print(\"Top terms per cluster:\")\n", "order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", "terms = vectorizer.get_feature_names()\n", "for i in range(number_of_clusters):\n", " print(\"Cluster %d:\" % i),\n", " for ind in order_centroids[i, :5]:\n", " print(' %s' % terms[ind]),\n", " print ''" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Top terms per cluster:\n", "Cluster 0: subcommittee chairman amendment water policy \n", "Cluster 1: sewage epa amendment treatment policy \n", "Cluster 2: going rule epa saying anytime \n", "Cluster 3: issue fact trying matter obviously \n", "Cluster 4: blending treatment use communities weather \n" ] } ], "prompt_number": 89 }, { "cell_type": "code", "collapsed": false, "input": [ "speeches_df[\"word_count\"].describe()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 61, "text": [ "count 702.000000\n", "mean 273.216524\n", "std 698.807057\n", "min 3.000000\n", "25% 17.000000\n", "50% 92.500000\n", "75% 368.750000\n", "max 15402.000000\n", "dtype: float64" ] } ], "prompt_number": 61 }, { "cell_type": "code", "collapsed": false, "input": [ "speeches_df[speeches_df[\"word_count\"] < 17][\"contents\"][:10]" 
], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 63, "text": [ "5 i yield to the gentleman from illinois . \\n\n", "7 i yield to the gentleman from illinois . \\n\n", "14 mr. chairman , i demand a recorded vote . \\n\n", "24 i am , mr. speaker , in its present form . \\n\n", "27 mr. speaker , i demand a recorded vote . \\n\n", "28 mr. chairman , i offer an amendment . \\n\n", "30 mr. chairman , how much time do i have remaini...\n", "32 mr. chairman , i demand a recorded vote . \\n\n", "37 mr. chairman , i yield back the balance of my ...\n", "39 mr. chairman , i yield 2 minutes to the gentle...\n", "Name: contents, dtype: object" ] } ], "prompt_number": 63 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Harry Potter Fanfiction" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/hp.zip" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\r\n", " Dload Upload Total Spent Left Speed\r\n", "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 32 9226k 32 2952k 0 0 2454k 0 0:00:03 0:00:01 0:00:02 2452k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 95 9226k 95 8832k 0 0 3982k 0 0:00:02 0:00:02 --:--:-- 3981k" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 9226k 100 9226k 0 0 4007k 0 0:00:02 0:00:02 --:--:-- 4006k\r\n" ] } ], "prompt_number": 94 }, { "cell_type": "code", "collapsed": false, "input": [ "!unzip -y hp.zip" ], "language": 
"python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "UnZip 5.52 of 28 February 2005, by Info-ZIP. Maintained by C. Spieler. Send\r\n", "bug reports using http://www.info-zip.org/zip-bug.html; see README for details.\r\n", "\r\n", "Usage: unzip [-Z] [-opts[modifiers]] file[.zip] [list] [-x xlist] [-d exdir]\r\n", " Default action is to extract files in list, except those in xlist, to exdir;\r\n", " file[.zip] may be a wildcard. -Z => ZipInfo mode (\"unzip -Z\" for usage).\r\n", "\r\n", " -p extract files to pipe, no messages -l list files (short format)\r\n", " -f freshen existing files, create none -t test compressed archive data\r\n", " -u update files, create if necessary -z display archive comment\r\n", " -x exclude files that follow (in xlist) -d extract files into exdir\r\n", "\r\n", "modifiers: -q quiet mode (-qq => quieter)\r\n", " -n never overwrite existing files -a auto-convert any text files\r\n", " -o overwrite files WITHOUT prompting -aa treat ALL files as text\r\n", " -j junk paths (do not make directories) -v be verbose/print version info\r\n", " -C match filenames case-insensitively -L make (some) names lowercase\r\n", " -X restore UID/GID info -V retain VMS version numbers\r\n", " -K keep setuid/setgid/tacky permissions -M pipe through \"more\" pager\r\n", "Examples (see unzip.txt for more info):\r\n", " unzip data1 -x joe => extract all files except joe from zipfile data1.zip\r\n", " unzip -p foo | more => send contents of foo.zip via pipe into program more\r\n", " unzip -fo foo ReadMe => quietly replace existing ReadMe if archive file newer\r\n" ] } ], "prompt_number": 96 }, { "cell_type": "code", "collapsed": false, "input": [ "paths = glob.glob(\"hp/*\")\n", "\n", "fanfics = []\n", "for path in paths:\n", " contents = open(path).read()\n", " # For fear of unicode\n", " contents = contents.decode(\"ascii\",\"ignore\")\n", " contents = contents.lower()\n", " \n", " fanfic = {}\n", " fanfic['contents'] = 
contents\n", " filename = path[3:]\n", " fanfic['name'] = filename[:-4]\n", " fanfics.append(fanfic)\n", "\n", "fanfics_df = pd.DataFrame(fanfics)\n", "fanfics_df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contentsname
0 prologue: the missiondisclaimer: all character... 10001898
1 blackdisclaimer: i do not own harry potterauth... 10004131
2 chapter 1\"i'm pregnant.\"\"\"\"mum please say some... 10004927
3 author's note: hey, just so you know, this is ... 10007980
4 disclaimer: i do not own harry potter and frie... 10010343
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 103, "text": [ " contents name\n", "0 prologue: the missiondisclaimer: all character... 10001898\n", "1 blackdisclaimer: i do not own harry potterauth... 10004131\n", "2 chapter 1\"i'm pregnant.\"\"\"\"mum please say some... 10004927\n", "3 author's note: hey, just so you know, this is ... 10007980\n", "4 disclaimer: i do not own harry potter and frie... 10010343" ] } ], "prompt_number": 103 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')\n", "X = vectorizer.fit_transform(fanfics_df['contents'])\n", "\n", "number_of_clusters = 2\n", "km = KMeans(n_clusters=number_of_clusters)\n", "km.fit(X)\n", "\n", "print(\"Top terms per cluster:\")\n", "order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", "terms = vectorizer.get_feature_names()\n", "for i in range(number_of_clusters):\n", " print(\"Cluster %d:\" % i),\n", " for ind in order_centroids[i, :8]:\n", " print(' %s' % terms[ind]),\n", " print ''" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Top terms per cluster:\n", "Cluster 0: harry hermione draco said just ron like ginny \n", "Cluster 1: lily james sirius remus said harry just eyes \n" ] } ], "prompt_number": 107 }, { "cell_type": "code", "collapsed": false, "input": [ "fanfics_df[\"k-means labels\"] = km.labels_\n", "harrys_friends_df = fanfics_df[fanfics_df[\"k-means labels\"] == 0]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 108 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')\n", "X = vectorizer.fit_transform(harrys_friends_df['contents'])\n", "\n", "number_of_clusters = 5\n", "km = KMeans(n_clusters=number_of_clusters)\n", "km.fit(X)\n", "\n", "print(\"Top terms per cluster:\")\n", "order_centroids = km.cluster_centers_.argsort()[:, 
::-1]\n", "terms = vectorizer.get_feature_names()\n", "for i in range(number_of_clusters):\n", " print(\"Cluster %d:\" % i),\n", " for ind in order_centroids[i, :8]:\n", " print(' %s' % terms[ind]),\n", " print ''" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Top terms per cluster:\n", "Cluster 0: draco hermione harry malfoy just said like granger \n", "Cluster 1: harry said hermione ron ginny dumbledore potter just \n", "Cluster 2: hermione ron harry ginny said just fred like \n", "Cluster 3: rose scorpius albus said al just malfoy hugo \n", "Cluster 4: said like just eyes severus know harry time \n" ] } ], "prompt_number": 109 }, { "cell_type": "code", "collapsed": false, "input": [ "harrys_friends_df[\"friends label\"] = km.labels_\n", "draco_herm_df = harrys_friends_df[harrys_friends_df[\"friends label\"] == 0]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 110 }, { "cell_type": "code", "collapsed": false, "input": [ "vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')\n", "X = vectorizer.fit_transform(draco_herm_df['contents'])\n", "\n", "number_of_clusters = 3\n", "km = KMeans(n_clusters=number_of_clusters)\n", "km.fit(X)\n", "\n", "print(\"Top terms per cluster:\")\n", "order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", "terms = vectorizer.get_feature_names()\n", "for i in range(number_of_clusters):\n", " print(\"Cluster %d:\" % i),\n", " for ind in order_centroids[i, :8]:\n", " print(' %s' % terms[ind]),\n", " print ''" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Top terms per cluster:\n", "Cluster 0: draco blaise fred pansy george like theo know \n", "Cluster 1: draco hermione said just like granger didn malfoy \n", "Cluster 2: harry draco potter just malfoy like man eyes \n" ] } ], "prompt_number": 112 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 
Twilight" ] }, { "cell_type": "code", "collapsed": false, "input": [ "!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/twilight.zip\n", "!unzip -o twilight.zip" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\r\n", " Dload Upload Total Spent Left Speed\r\n", "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r", "100 2831k 100 2831k 0 0 2784k 0 0:00:01 0:00:01 --:--:-- 2787k\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Archive: twilight.zip\r\n", " inflating: twilight/10016071.txt \r\n", " inflating: twilight/10016524.txt \r\n", " inflating: twilight/10019441.txt \r\n", " inflating: twilight/10021891.txt \r\n", " inflating: twilight/10029139.txt \r\n", " inflating: twilight/10029797.txt \r\n", " inflating: twilight/10030147.txt \r\n", " inflating: twilight/10042940.txt \r\n", " inflating: twilight/10046233.txt \r\n", " inflating: twilight/10047055.txt \r\n", " inflating: twilight/10050374.txt \r\n", " inflating: twilight/10050718.txt \r\n", " inflating: twilight/10058807.txt \r\n", " inflating: twilight/10060217.txt \r\n", " inflating: twilight/10060881.txt \r\n", " inflating: twilight/10061496.txt \r\n", " inflating: twilight/10064909.txt \r\n", " inflating: twilight/10078293.txt \r\n", " inflating: twilight/10079909.txt \r\n", " inflating: twilight/10082628.txt \r\n", " inflating: twilight/10086525.txt \r\n", " inflating: twilight/10086845.txt \r\n", " inflating: twilight/10087055.txt \r\n", " inflating: twilight/10088968.txt \r\n", " inflating: twilight/10090541.txt \r\n", " inflating: twilight/10103988.txt \r\n", " inflating: twilight/10109144.txt \r\n", " inflating: twilight/10109176.txt \r\n", 
" inflating: twilight/10109917.txt \r\n", " inflating: twilight/10120506.txt \r\n", " inflating: twilight/10130957.txt \r\n", " inflating: twilight/10134818.txt \r\n", " inflating: twilight/10135294.txt \r\n", " inflating: twilight/10140051.txt \r\n", " inflating: twilight/10141874.txt \r\n", " inflating: twilight/10145528.txt \r\n", " inflating: twilight/10150704.txt \r\n", " inflating: twilight/10152474.txt \r\n", " inflating: twilight/10156755.txt \r\n", " inflating: twilight/10161589.txt \r\n", " inflating: twilight/10170411.txt \r\n", " inflating: twilight/10170441.txt \r\n", " inflating: twilight/10171111.txt \r\n", " inflating: twilight/10177219.txt \r\n", " inflating: twilight/10180541.txt \r\n", " inflating: twilight/10187563.txt \r\n", " inflating: twilight/10189248.txt \r\n", " inflating: twilight/10189525.txt \r\n", " inflating: twilight/10191037.txt \r\n", " inflating: twilight/10191255.txt \r\n", " inflating: twilight/10191633.txt \r\n", " inflating: twilight/10192934.txt \r\n", " inflating: twilight/10194603.txt \r\n", " inflating: twilight/10195667.txt \r\n", " inflating: twilight/10196304.txt \r\n", " inflating: twilight/10199678.txt \r\n", " inflating: twilight/10200439.txt \r\n", " inflating: twilight/10204063.txt \r\n", " inflating: twilight/10204235.txt \r\n", " inflating: twilight/10204244.txt \r\n", " inflating: twilight/10207563.txt \r\n", " inflating: twilight/10208579.txt \r\n", " inflating: twilight/10212570.txt \r\n", " inflating: twilight/10216557.txt \r\n", " inflating: twilight/10216605.txt \r\n", " inflating: twilight/10222122.txt \r\n", " inflating: twilight/10232528.txt \r\n", " inflating: twilight/10235795.txt \r\n", " inflating: twilight/10241550.txt \r\n", " inflating: twilight/10246172.txt \r\n", " inflating: twilight/10246462.txt \r\n", " inflating: twilight/10248062.txt \r\n", " inflating: twilight/10250458.txt \r\n", " inflating: twilight/10251763.txt \r\n", " inflating: twilight/10251981.txt \r\n", " inflating: 
twilight/10258946.txt \r\n", " inflating: twilight/10262365.txt \r\n", " inflating: twilight/10262385.txt \r\n", " inflating: twilight/10262983.txt \r\n", " inflating: twilight/10268896.txt \r\n", " inflating: twilight/10269161.txt \r\n", " inflating: twilight/10273916.txt \r\n", " inflating: twilight/10280960.txt \r\n", " inflating: twilight/10281896.txt \r\n", " inflating: twilight/10284689.txt \r\n", " inflating: twilight/10284801.txt " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", " inflating: twilight/10286275.txt \r\n", " inflating: twilight/10294367.txt \r\n", " inflating: twilight/10295939.txt \r\n", " inflating: twilight/10297442.txt \r\n", " inflating: twilight/10298933.txt \r\n", " inflating: twilight/10301389.txt \r\n", " inflating: twilight/10302586.txt \r\n", " inflating: twilight/10307650.txt \r\n", " inflating: twilight/10312333.txt \r\n", " inflating: twilight/10314566.txt \r\n", " inflating: twilight/10323040.txt \r\n", " inflating: twilight/10323045.txt \r\n", " inflating: twilight/10325720.txt \r\n", " inflating: twilight/10327214.txt \r\n", " inflating: twilight/10328544.txt \r\n", " inflating: twilight/10333204.txt \r\n", " inflating: twilight/10334017.txt \r\n", " inflating: twilight/10335946.txt \r\n", " inflating: twilight/10336553.txt \r\n", " inflating: twilight/10336818.txt \r\n", " inflating: twilight/10337605.txt \r\n", " inflating: twilight/10339410.txt \r\n", " inflating: twilight/10343797.txt \r\n", " inflating: twilight/10344814.txt \r\n", " inflating: twilight/10345258.txt \r\n", " inflating: twilight/10348194.txt \r\n", " inflating: twilight/10352165.txt \r\n", " inflating: twilight/10357506.txt \r\n", " inflating: twilight/10358658.txt \r\n", " inflating: twilight/10362789.txt \r\n", " inflating: twilight/10363568.txt \r\n", " inflating: twilight/10365151.txt \r\n", " inflating: twilight/10370676.txt \r\n", " inflating: twilight/10370942.txt \r\n", " inflating: twilight/10371789.txt \r\n", " inflating: 
twilight/10372056.txt \r\n", " inflating: twilight/10374514.txt \r\n", " inflating: twilight/10376925.txt \r\n", " inflating: twilight/10379848.txt \r\n", " inflating: twilight/10381590.txt \r\n", " inflating: twilight/10387254.txt \r\n", " inflating: twilight/10391280.txt \r\n", " inflating: twilight/10395072.txt \r\n", " inflating: twilight/10395577.txt \r\n", " inflating: twilight/10396002.txt \r\n", " inflating: twilight/10397311.txt \r\n", " inflating: twilight/10397579.txt \r\n", " inflating: twilight/10398693.txt \r\n", " inflating: twilight/10403468.txt \r\n", " inflating: twilight/10404430.txt \r\n", " inflating: twilight/10407405.txt \r\n", " inflating: twilight/10412446.txt \r\n", " inflating: twilight/10414259.txt \r\n", " inflating: twilight/10415883.txt \r\n", " inflating: twilight/10417349.txt \r\n", " inflating: twilight/10418008.txt \r\n", " inflating: twilight/10418103.txt \r\n", " inflating: twilight/10425518.txt \r\n", " inflating: twilight/10426753.txt \r\n", " inflating: twilight/10427603.txt \r\n", " inflating: twilight/10428183.txt \r\n", " inflating: twilight/10430881.txt \r\n", " inflating: twilight/10432467.txt \r\n", " inflating: twilight/10433266.txt \r\n", " inflating: twilight/10433536.txt \r\n", " inflating: twilight/10434458.txt \r\n", " inflating: twilight/10434895.txt \r\n", " inflating: twilight/10438356.txt \r\n", " inflating: twilight/10438858.txt \r\n", " inflating: twilight/10446328.txt \r\n", " inflating: twilight/10451419.txt \r\n", " inflating: twilight/10453301.txt \r\n", " inflating: twilight/10453491.txt \r\n", " inflating: twilight/10453825.txt \r\n", " inflating: twilight/10457477.txt \r\n", " inflating: twilight/10457506.txt \r\n", " inflating: twilight/10462777.txt \r\n", " inflating: twilight/10472264.txt \r\n", " inflating: twilight/10473271.txt \r\n", " inflating: twilight/10473280.txt \r\n", " inflating: twilight/10474456.txt \r\n", " inflating: twilight/10474580.txt \r\n", " inflating: twilight/10474855.txt 
\r\n", " inflating: twilight/10475090.txt \r\n", " inflating: twilight/10477604.txt \r\n", " inflating: twilight/10477950.txt \r\n", " inflating: twilight/10478310.txt \r\n", " inflating: twilight/10481398.txt \r\n", " inflating: twilight/10481550.txt \r\n", " inflating: twilight/10483597.txt " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", " inflating: twilight/10485026.txt \r\n", " inflating: twilight/10487872.txt \r\n", " inflating: twilight/10488717.txt \r\n", " inflating: twilight/10490963.txt \r\n", " inflating: twilight/10491290.txt \r\n", " inflating: twilight/10491955.txt \r\n", " inflating: twilight/10491999.txt \r\n", " inflating: twilight/10492279.txt \r\n", " inflating: twilight/10493288.txt \r\n", " inflating: twilight/10493501.txt \r\n", " inflating: twilight/10494243.txt \r\n", " inflating: twilight/10496508.txt \r\n", " inflating: twilight/10500426.txt \r\n", " inflating: twilight/10502406.txt \r\n", " inflating: twilight/10504088.txt \r\n", " inflating: twilight/10505005.txt \r\n", " inflating: twilight/10509496.txt \r\n", " inflating: twilight/10509795.txt \r\n", " inflating: twilight/10510031.txt \r\n", " inflating: twilight/10510951.txt \r\n", " inflating: twilight/10511805.txt \r\n", " inflating: twilight/10513794.txt \r\n", " inflating: twilight/10514795.txt \r\n", " inflating: twilight/10517876.txt \r\n", " inflating: twilight/10520140.txt \r\n", " inflating: twilight/10521227.txt \r\n", " inflating: twilight/10523218.txt \r\n", " inflating: twilight/10524772.txt \r\n", " inflating: twilight/10526181.txt \r\n", " inflating: twilight/10527292.txt \r\n", " inflating: twilight/10527617.txt \r\n", " inflating: twilight/10528547.txt \r\n", " inflating: twilight/10528594.txt \r\n", " inflating: twilight/10529345.txt \r\n", " inflating: twilight/10531169.txt \r\n", " inflating: twilight/10533355.txt \r\n", " inflating: twilight/10534458.txt \r\n", " inflating: twilight/10534532.txt \r\n", " inflating: twilight/10535653.txt 
\r\n", " inflating: twilight/10536476.txt \r\n", " inflating: twilight/10537199.txt \r\n", " inflating: twilight/10537772.txt \r\n", " inflating: twilight/10541448.txt \r\n", " inflating: twilight/10541579.txt \r\n", " inflating: twilight/10543643.txt \r\n", " inflating: twilight/10546687.txt \r\n", " inflating: twilight/10547113.txt \r\n", " inflating: twilight/10547343.txt \r\n", " inflating: twilight/10547370.txt \r\n", " inflating: twilight/10547780.txt \r\n", " inflating: twilight/10548403.txt \r\n", " inflating: twilight/10548632.txt \r\n", " inflating: twilight/10550216.txt \r\n", " inflating: twilight/10551522.txt \r\n", " inflating: twilight/10552181.txt \r\n", " inflating: twilight/10553063.txt \r\n", " inflating: twilight/10554639.txt \r\n", " inflating: twilight/10555069.txt \r\n", " inflating: twilight/10555577.txt \r\n", " inflating: twilight/10555968.txt \r\n", " inflating: twilight/10556127.txt \r\n", " inflating: twilight/10556168.txt \r\n", " inflating: twilight/10558082.txt \r\n", " inflating: twilight/10558233.txt \r\n", " inflating: twilight/10558446.txt \r\n", " inflating: twilight/10558662.txt \r\n", " inflating: twilight/10559054.txt \r\n", " inflating: twilight/10559764.txt \r\n", " inflating: twilight/10560953.txt \r\n", " inflating: twilight/10562121.txt \r\n", " inflating: twilight/10562706.txt \r\n", " inflating: twilight/10562816.txt \r\n", " inflating: twilight/10563237.txt \r\n", " inflating: twilight/10563689.txt \r\n", " inflating: twilight/10564873.txt \r\n", " inflating: twilight/10565056.txt \r\n", " inflating: twilight/10567010.txt \r\n", " inflating: twilight/10567223.txt \r\n", " inflating: twilight/10567394.txt \r\n", " inflating: twilight/10568543.txt " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", " inflating: twilight/10569116.txt \r\n", " inflating: twilight/10569143.txt \r\n", " inflating: twilight/10569246.txt \r\n", " inflating: twilight/10571217.txt \r\n", " inflating: twilight/10572009.txt 
\r\n", " inflating: twilight/10573849.txt \r\n", " inflating: twilight/10574055.txt \r\n", " inflating: twilight/10575483.txt \r\n", " inflating: twilight/10575699.txt \r\n", " inflating: twilight/10575743.txt \r\n", " inflating: twilight/10576242.txt \r\n", " inflating: twilight/10577724.txt \r\n", " inflating: twilight/10578186.txt \r\n", " inflating: twilight/10579028.txt \r\n", " inflating: twilight/10579440.txt \r\n", " inflating: twilight/10579738.txt \r\n", " inflating: twilight/10581551.txt \r\n", " inflating: twilight/10582449.txt \r\n", " inflating: twilight/10583173.txt \r\n", " inflating: twilight/10583990.txt \r\n", " inflating: twilight/10584271.txt \r\n", " inflating: twilight/10584313.txt \r\n", " inflating: twilight/10585251.txt \r\n", " inflating: twilight/10585363.txt \r\n", " inflating: twilight/10585807.txt \r\n", " inflating: twilight/10585949.txt \r\n", " inflating: twilight/10586787.txt \r\n", " inflating: twilight/10586903.txt \r\n", " inflating: twilight/10587095.txt \r\n", " inflating: twilight/10587412.txt \r\n", " inflating: twilight/10587486.txt \r\n", " inflating: twilight/10588117.txt \r\n", " inflating: twilight/10590030.txt \r\n", " inflating: twilight/10590413.txt \r\n", " inflating: twilight/10590578.txt \r\n", " inflating: twilight/10591164.txt \r\n", " inflating: twilight/10591786.txt \r\n", " inflating: twilight/10592328.txt \r\n", " inflating: twilight/10594034.txt \r\n", " inflating: twilight/10594298.txt \r\n", " inflating: twilight/10594448.txt \r\n", " inflating: twilight/10594891.txt \r\n", " inflating: twilight/10595084.txt \r\n", " inflating: twilight/10595448.txt \r\n", " inflating: twilight/10595722.txt \r\n", " inflating: twilight/10596133.txt \r\n", " inflating: twilight/10596177.txt \r\n", " inflating: twilight/10596192.txt \r\n", " inflating: twilight/10596374.txt \r\n", " inflating: twilight/10597666.txt \r\n", " inflating: twilight/10598064.txt \r\n", " inflating: twilight/10598593.txt \r\n", " inflating: 
twilight/10598938.txt \r\n", " inflating: twilight/10598968.txt \r\n", " inflating: twilight/10599652.txt \r\n", " inflating: twilight/10599692.txt \r\n", " inflating: twilight/10599794.txt \r\n", " inflating: twilight/10600073.txt \r\n", " inflating: twilight/10600300.txt \r\n", " inflating: twilight/10600566.txt \r\n", " inflating: twilight/10600676.txt \r\n", " inflating: twilight/10600792.txt \r\n", " inflating: twilight/10600868.txt \r\n", " inflating: twilight/10601348.txt \r\n", " inflating: twilight/10601367.txt \r\n", " inflating: twilight/10601482.txt \r\n", " inflating: twilight/10601786.txt \r\n", " inflating: twilight/10602048.txt \r\n", " inflating: twilight/10602382.txt \r\n", " inflating: twilight/10602676.txt \r\n", " inflating: twilight/10602697.txt \r\n", " inflating: twilight/10602788.txt \r\n", " inflating: twilight/10602822.txt \r\n", " inflating: twilight/10603050.txt \r\n", " inflating: twilight/10603099.txt \r\n", " inflating: twilight/10603119.txt \r\n", " inflating: twilight/10603423.txt \r\n", " inflating: twilight/10603438.txt \r\n", " inflating: twilight/10603800.txt " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", " inflating: twilight/10603842.txt \r\n", " inflating: twilight/10603961.txt \r\n", " inflating: twilight/10604332.txt \r\n", " inflating: twilight/10604342.txt \r\n", " inflating: twilight/10604632.txt \r\n", " inflating: twilight/10604633.txt \r\n", " inflating: twilight/10604662.txt \r\n", " inflating: twilight/10604771.txt \r\n", " inflating: twilight/10604944.txt \r\n", " inflating: twilight/10605257.txt \r\n", " inflating: twilight/10605627.txt \r\n", " inflating: twilight/10605754.txt \r\n", " inflating: twilight/10606009.txt \r\n", " inflating: twilight/10606063.txt \r\n", " inflating: twilight/10606134.txt \r\n", " inflating: twilight/10606239.txt \r\n", " inflating: twilight/10606349.txt \r\n", " inflating: twilight/10606357.txt \r\n", " inflating: twilight/10606404.txt \r\n", " inflating: 
twilight/10606406.txt \r\n", " inflating: twilight/10606596.txt \r\n", " inflating: twilight/10606710.txt \r\n", " inflating: twilight/10606764.txt \r\n", " inflating: twilight/10606787.txt \r\n", " inflating: twilight/10606793.txt \r\n", " inflating: twilight/10606808.txt \r\n", " inflating: twilight/10607110.txt \r\n", " inflating: twilight/10607388.txt \r\n", " inflating: twilight/10607430.txt \r\n", " inflating: twilight/10607571.txt \r\n", " inflating: twilight/10607776.txt \r\n", " inflating: twilight/10607787.txt \r\n", " inflating: twilight/10607912.txt \r\n", " inflating: twilight/10607918.txt \r\n", " inflating: twilight/10607932.txt \r\n", " inflating: twilight/10607972.txt \r\n", " inflating: twilight/10608013.txt \r\n", " inflating: twilight/10608017.txt \r\n", " inflating: twilight/10608103.txt \r\n", " inflating: twilight/10608139.txt \r\n", " inflating: twilight/10608163.txt \r\n", " inflating: twilight/10608220.txt \r\n", " inflating: twilight/10608285.txt \r\n", " inflating: twilight/10608534.txt \r\n", " inflating: twilight/10608813.txt \r\n", " inflating: twilight/10608939.txt \r\n", " inflating: twilight/10609029.txt \r\n", " inflating: twilight/10609365.txt \r\n", " inflating: twilight/10609722.txt \r\n", " inflating: twilight/10609791.txt \r\n", " inflating: twilight/10609918.txt \r\n", " inflating: twilight/10610013.txt \r\n", " inflating: twilight/10610032.txt \r\n", " inflating: twilight/10610179.txt \r\n", " inflating: twilight/10610348.txt \r\n", " inflating: twilight/10610356.txt \r\n", " inflating: twilight/10610486.txt \r\n", " inflating: twilight/10610852.txt \r\n", " inflating: twilight/10610855.txt \r\n", " inflating: twilight/10610917.txt \r\n", " inflating: twilight/10611126.txt \r\n", " inflating: twilight/10611205.txt \r\n", " inflating: twilight/10611269.txt \r\n", " inflating: twilight/10611278.txt \r\n", " inflating: twilight/10611406.txt \r\n", " inflating: twilight/10611504.txt \r\n", " inflating: twilight/10611704.txt 
\r\n", " inflating: twilight/10611856.txt \r\n", " inflating: twilight/10611900.txt \r\n", " inflating: twilight/10612025.txt \r\n", " inflating: twilight/10612026.txt \r\n", " inflating: twilight/10612073.txt \r\n", " inflating: twilight/10612120.txt \r\n", " inflating: twilight/10612223.txt \r\n", " inflating: twilight/10612262.txt \r\n", " inflating: twilight/10612396.txt \r\n", " inflating: twilight/10612469.txt \r\n", " inflating: twilight/10612522.txt \r\n", " inflating: twilight/10612543.txt \r\n", " inflating: twilight/10612604.txt \r\n", " inflating: twilight/10612720.txt \r\n", " inflating: twilight/10612742.txt \r\n", " inflating: twilight/10612987.txt \r\n", " inflating: twilight/10612989.txt \r\n", " inflating: twilight/10613463.txt \r\n", " inflating: twilight/10613499.txt \r\n", " inflating: twilight/10613520.txt \r\n", " inflating: twilight/10613525.txt \r\n", " inflating: twilight/10613572.txt \r\n", " inflating: twilight/10613578.txt \r\n", " inflating: twilight/10613584.txt \r\n", " inflating: twilight/10613687.txt \r\n", " inflating: twilight/10613710.txt \r\n", " inflating: twilight/10613723.txt \r\n", " inflating: twilight/10613937.txt \r\n", " inflating: twilight/10614061.txt \r\n", " inflating: twilight/10614126.txt \r\n", " inflating: twilight/10614171.txt \r\n", " inflating: twilight/10614312.txt \r\n", " inflating: twilight/10614343.txt " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", " inflating: twilight/10614353.txt \r\n", " inflating: twilight/10614445.txt \r\n", " inflating: twilight/4869815.txt \r\n", " inflating: twilight/5018846.txt \r\n", " inflating: twilight/5065602.txt \r\n", " inflating: twilight/5107651.txt \r\n", " inflating: twilight/5222864.txt \r\n", " inflating: twilight/5512555.txt \r\n", " inflating: twilight/5539671.txt \r\n", " inflating: twilight/5590332.txt \r\n", " inflating: twilight/5723175.txt \r\n", " inflating: twilight/5741779.txt \r\n", " inflating: twilight/5763619.txt \r\n", " 
inflating: twilight/5832550.txt \r\n", " inflating: twilight/5897650.txt \r\n", " inflating: twilight/5982525.txt \r\n", " inflating: twilight/6027176.txt \r\n", " inflating: twilight/6643551.txt \r\n", " inflating: twilight/6663669.txt \r\n", " inflating: twilight/6669949.txt \r\n", " inflating: twilight/6686525.txt \r\n", " inflating: twilight/6710619.txt \r\n", " inflating: twilight/6834202.txt \r\n", " inflating: twilight/6909951.txt \r\n", " inflating: twilight/6953804.txt \r\n", " inflating: twilight/7007740.txt \r\n", " inflating: twilight/7049842.txt \r\n", " inflating: twilight/7072736.txt \r\n", " inflating: twilight/7111096.txt \r\n", " inflating: twilight/7147261.txt \r\n", " inflating: twilight/7180599.txt \r\n", " inflating: twilight/7215043.txt \r\n", " inflating: twilight/7241648.txt \r\n", " inflating: twilight/7312315.txt \r\n", " inflating: twilight/7451302.txt \r\n", " inflating: twilight/7461067.txt \r\n", " inflating: twilight/7470526.txt \r\n", " inflating: twilight/7482748.txt \r\n", " inflating: twilight/7514383.txt \r\n", " inflating: twilight/7544799.txt \r\n", " inflating: twilight/7585406.txt \r\n", " inflating: twilight/7657479.txt \r\n", " inflating: twilight/7710461.txt \r\n", " inflating: twilight/7807895.txt \r\n", " inflating: twilight/7819564.txt \r\n", " inflating: twilight/7850576.txt \r\n", " inflating: twilight/7868518.txt \r\n", " inflating: twilight/7907513.txt \r\n", " inflating: twilight/7931275.txt \r\n", " inflating: twilight/8005615.txt \r\n", " inflating: twilight/8023905.txt \r\n", " inflating: twilight/8051177.txt \r\n", " inflating: twilight/8062991.txt \r\n", " inflating: twilight/8177305.txt \r\n", " inflating: twilight/8187260.txt \r\n", " inflating: twilight/8234864.txt \r\n", " inflating: twilight/8259942.txt \r\n", " inflating: twilight/8282455.txt \r\n", " inflating: twilight/8312984.txt \r\n", " inflating: twilight/8317307.txt \r\n", " inflating: twilight/8348608.txt \r\n", " inflating: twilight/8353504.txt 
\r\n", " inflating: twilight/8360306.txt \r\n", " inflating: twilight/8361607.txt \r\n", " inflating: twilight/8384503.txt \r\n", " inflating: twilight/8419554.txt \r\n", " inflating: twilight/8444455.txt \r\n", " inflating: twilight/8509717.txt \r\n", " inflating: twilight/8616601.txt \r\n", " inflating: twilight/8666106.txt \r\n", " inflating: twilight/8676067.txt \r\n", " inflating: twilight/8693686.txt \r\n", " inflating: twilight/8706065.txt \r\n", " inflating: twilight/8727994.txt \r\n", " inflating: twilight/8728238.txt \r\n", " inflating: twilight/8768557.txt \r\n", " inflating: twilight/8837147.txt \r\n", " inflating: twilight/8842965.txt \r\n", " inflating: twilight/8853059.txt \r\n", " inflating: twilight/8856038.txt \r\n", " inflating: twilight/8856278.txt \r\n", " inflating: twilight/8865881.txt \r\n", " inflating: twilight/8895989.txt \r\n", " inflating: twilight/8897114.txt \r\n", " inflating: twilight/8925733.txt \r\n", " inflating: twilight/8933287.txt \r\n", " inflating: twilight/8962912.txt \r\n", " inflating: twilight/8981010.txt \r\n", " inflating: twilight/8981302.txt \r\n", " inflating: twilight/8996197.txt \r\n", " inflating: twilight/8997361.txt \r\n", " inflating: twilight/9038159.txt \r\n", " inflating: twilight/9048088.txt \r\n", " inflating: twilight/9053802.txt \r\n", " inflating: twilight/9097248.txt \r\n", " inflating: twilight/9119196.txt \r\n", " inflating: twilight/9141251.txt \r\n", " inflating: twilight/9145261.txt \r\n", " inflating: twilight/9148990.txt \r\n", " inflating: twilight/9174762.txt \r\n", " inflating: twilight/9176270.txt \r\n", " inflating: twilight/9187817.txt \r\n", " inflating: twilight/9235498.txt \r\n", " inflating: twilight/9245557.txt \r\n", " inflating: twilight/9258328.txt " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", " inflating: twilight/9260002.txt \r\n", " inflating: twilight/9291120.txt \r\n", " inflating: twilight/9307320.txt \r\n", " inflating: twilight/9307324.txt \r\n", 
" inflating: twilight/9320161.txt \r\n", " inflating: twilight/9324745.txt \r\n", " inflating: twilight/9350379.txt \r\n", " inflating: twilight/9363528.txt \r\n", " inflating: twilight/9364380.txt \r\n", " inflating: twilight/9366781.txt \r\n", " inflating: twilight/9372612.txt \r\n", " inflating: twilight/9376361.txt \r\n", " inflating: twilight/9382021.txt \r\n", " inflating: twilight/9384370.txt \r\n", " inflating: twilight/9388253.txt \r\n", " inflating: twilight/9449313.txt \r\n", " inflating: twilight/9451138.txt \r\n", " inflating: twilight/9457516.txt \r\n", " inflating: twilight/9460911.txt \r\n", " inflating: twilight/9463656.txt \r\n", " inflating: twilight/9477911.txt \r\n", " inflating: twilight/9497589.txt \r\n", " inflating: twilight/9507423.txt \r\n", " inflating: twilight/9519349.txt \r\n", " inflating: twilight/9542579.txt \r\n", " inflating: twilight/9544658.txt \r\n", " inflating: twilight/9547136.txt \r\n", " inflating: twilight/9564350.txt \r\n", " inflating: twilight/9570770.txt \r\n", " inflating: twilight/9573360.txt \r\n", " inflating: twilight/9584935.txt \r\n", " inflating: twilight/9599527.txt \r\n", " inflating: twilight/9602556.txt \r\n", " inflating: twilight/9613741.txt \r\n", " inflating: twilight/9621627.txt \r\n", " inflating: twilight/9632720.txt \r\n", " inflating: twilight/9643272.txt \r\n", " inflating: twilight/9643999.txt \r\n", " inflating: twilight/9669120.txt \r\n", " inflating: twilight/9670479.txt \r\n", " inflating: twilight/9684794.txt \r\n", " inflating: twilight/9691507.txt \r\n", " inflating: twilight/9691564.txt \r\n", " inflating: twilight/9698577.txt \r\n", " inflating: twilight/9712607.txt \r\n", " inflating: twilight/9713348.txt \r\n", " inflating: twilight/9717709.txt \r\n", " inflating: twilight/9724618.txt \r\n", " inflating: twilight/9730929.txt \r\n", " inflating: twilight/9731152.txt \r\n", " inflating: twilight/9735154.txt \r\n", " inflating: twilight/9741363.txt \r\n", " inflating: 
# Load every fanfic in twilight/ into a DataFrame with columns
# 'contents' (lowercased ASCII text) and 'name' (the numeric story id).
import glob
import os


def fanfic_name(path):
    """Return the story id for a path like 'twilight/10016071.txt' -> '10016071'.

    Uses basename + splitext instead of fixed-offset slicing, so it works
    regardless of the directory prefix length.
    """
    return os.path.splitext(os.path.basename(path))[0]


paths = glob.glob("twilight/*")

fanfics = []
for path in paths:
    contents = open(path).read()
    # For fear of unicode: drop any non-ASCII bytes.
    # NOTE(review): str.decode is Python 2 only -- this notebook runs on a
    # Python 2 kernel (see the `print ''` statement in a later cell).
    contents = contents.decode("ascii", "ignore")
    contents = contents.lower()

    fanfic = {}
    fanfic['contents'] = contents
    # BUG FIX: the original used path[3:], which only stripped 3 characters
    # of the 9-character 'twilight/' prefix, yielding names like
    # 'light/10016071' (visible in the original cell output).
    fanfic['name'] = fanfic_name(path)
    fanfics.append(fanfic)

fanfics_df = pd.DataFrame(fanfics)
fanfics_df.head()
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contentsname
0 what follows is the original one-shot-no edits... light/10016071
1 a/n: hey guys this is my new story hope you e... light/10016524
2 i saw him. him and bella. but who cares about ... light/10019441
3 disclaimer: all publicly recognizable characte... light/10021891
4 thou art a heartless monsterdisclaimer: all re... light/10029139
\n", "
# Vectorize each fanfic into tf-idf weights over the 10,000 most frequent
# terms (English stopwords removed), then cluster the documents with k-means
# and report the highest-weighted terms of each cluster centroid.
# NOTE(review): TfidfVectorizer and KMeans are presumably imported from
# sklearn in an earlier (unseen) cell -- confirm before re-running.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(fanfics_df['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
# argsort ascending, then reverse each row so column 0 is the heaviest term.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    # Build the line in one string instead of the Python-2-only
    # trailing-comma print statements the original used (which also left
    # a stray trailing space and a bare `print ''`).
    top_terms = " ".join(terms[ind] for ind in order_centroids[i, :8])
    print("Cluster %d: %s" % (i, top_terms))