{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Word frequencies in bike-theft stories\n", "# Input: a newspaper article scraped to ../data/stories.txt\n", "\n", "import pandas as pd\n", "import glob, os " ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import collections\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Not a real CSV: with header=None pandas loads the file as one unnamed\n", "# column, and the article text is read back below via data[0][0].\n", "data = pd.read_csv('../data/stories.txt', header = None)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
0Also on Stolen Bikes UK Snow — Congrats ...
\n", "
" ], "text/plain": [ " 0\n", "0 Also on Stolen Bikes UK Snow — Congrats ..." ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Quick look: one row, one column holding the whole article\n", "data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Keep letters only, lowercase, collapse runs of whitespace.\n", "# Raw strings (r'...') so the regex backslashes are not string escapes.\n", "import re\n", "letters_only = re.sub(r'[^a-zA-Z]', ' ', data[0][0])\n", "letters_only = letters_only.lower()\n", "letters_only = re.sub(r'\s+', ' ', letters_only)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# split at whitespace (collections is already imported above)\n", "words = letters_only.split()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1499" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(words)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1499" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "wordarray = np.asarray(words)\n", "len(wordarray)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# One Counter per token (each txt is a single word); summed in the next cell\n", "bagsofwords = [ collections.Counter(re.findall(r'\w+', txt))\n", " for txt in wordarray]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Counter({'a': 36,\n", " 'able': 1,\n", " 'about': 7,\n", " 'acceptable': 1,\n", " 'action': 1,\n", " 'activity': 1,\n", " 'actually': 1,\n", " 'add': 1,\n", " 'address': 2,\n", " 'advert': 1,\n", " 'africa': 1,\n", " 'after': 3,\n", " 'again': 4,\n", " 'ago': 4,\n", " 'all': 8,\n", " 'allowed': 1,\n", " 'almost': 1,\n", " 'also': 2,\n", " 'although': 1,\n", " 'am': 1,\n", " 'america': 1,\n", " 
'an': 4,\n", " 'and': 44,\n", " 'angry': 1,\n", " 'annoyed': 1,\n", " 'another': 1,\n", " 'anti': 1,\n", " 'any': 3,\n", " 'anyone': 1,\n", " 'anything': 1,\n", " 'anyway': 1,\n", " 'appear': 1,\n", " 'approached': 1,\n", " 'area': 1,\n", " 'arm': 1,\n", " 'around': 1,\n", " 'arrange': 1,\n", " 'arrest': 1,\n", " 'arrested': 1,\n", " 'arrests': 1,\n", " 'as': 7,\n", " 'asked': 1,\n", " 'assigned': 1,\n", " 'assured': 1,\n", " 'at': 9,\n", " 'attended': 1,\n", " 'attention': 1,\n", " 'auction': 1,\n", " 'available': 1,\n", " 'avatar': 3,\n", " 'aware': 1,\n", " 'away': 5,\n", " 'back': 2,\n", " 'bad': 1,\n", " 'badgering': 1,\n", " 'baffled': 1,\n", " 'be': 7,\n", " 'bebo': 1,\n", " 'became': 1,\n", " 'because': 3,\n", " 'become': 1,\n", " 'been': 5,\n", " 'before': 1,\n", " 'behold': 1,\n", " 'being': 1,\n", " 'best': 1,\n", " 'bid': 1,\n", " 'bike': 24,\n", " 'bikes': 7,\n", " 'bit': 1,\n", " 'blog': 1,\n", " 'bombshell': 1,\n", " 'bought': 1,\n", " 'bounces': 1,\n", " 'buiilding': 1,\n", " 'built': 4,\n", " 'burn': 1,\n", " 'but': 4,\n", " 'by': 4,\n", " 'call': 2,\n", " 'came': 2,\n", " 'camera': 1,\n", " 'can': 3,\n", " 'carpark': 1,\n", " 'case': 1,\n", " 'cctv': 5,\n", " 'centre': 2,\n", " 'check': 2,\n", " 'cheeky': 1,\n", " 'co': 1,\n", " 'collect': 1,\n", " 'com': 1,\n", " 'come': 2,\n", " 'comforting': 1,\n", " 'comment': 1,\n", " 'commenting': 1,\n", " 'comments': 2,\n", " 'community': 1,\n", " 'component': 2,\n", " 'comprehensive': 1,\n", " 'confirmed': 2,\n", " 'congrats': 1,\n", " 'considered': 1,\n", " 'contact': 2,\n", " 'contacted': 4,\n", " 'convince': 1,\n", " 'cop': 1,\n", " 'could': 1,\n", " 'couple': 2,\n", " 'cove': 1,\n", " 'covered': 1,\n", " 'crime': 2,\n", " 'criminal': 1,\n", " 'custom': 2,\n", " 'customers': 1,\n", " 'cyclist': 1,\n", " 'd': 6,\n", " 'dating': 1,\n", " 'day': 9,\n", " 'dealer': 1,\n", " 'deciding': 1,\n", " 'defect': 1,\n", " 'denies': 1,\n", " 'described': 1,\n", " 'description': 2,\n", " 'details': 2,\n", " 
'detonate': 1,\n", " 'devastating': 1,\n", " 'device': 1,\n", " 'did': 2,\n", " 'didn': 1,\n", " 'dire': 1,\n", " 'discovered': 1,\n", " 'disqus': 2,\n", " 'do': 3,\n", " 'does': 2,\n", " 'don': 3,\n", " 'done': 1,\n", " 'donkey': 1,\n", " 'down': 2,\n", " 'dream': 1,\n", " 'dropped': 1,\n", " 'drug': 1,\n", " 'drugs': 2,\n", " 'due': 1,\n", " 'each': 1,\n", " 'earlier': 1,\n", " 'ebay': 5,\n", " 'end': 1,\n", " 'entrapment': 1,\n", " 'erm': 1,\n", " 'estate': 1,\n", " 'even': 3,\n", " 'ever': 2,\n", " 'every': 6,\n", " 'exactly': 1,\n", " 'except': 1,\n", " 'expensive': 1,\n", " 'experienced': 1,\n", " 'explain': 1,\n", " 'explained': 1,\n", " 'extreme': 1,\n", " 'eye': 1,\n", " 'factor': 1,\n", " 'fail': 1,\n", " 'familiar': 1,\n", " 'feels': 1,\n", " 'figure': 1,\n", " 'finding': 1,\n", " 'first': 2,\n", " 'flat': 1,\n", " 'flip': 1,\n", " 'fobbed': 1,\n", " 'footage': 1,\n", " 'for': 10,\n", " 'force': 2,\n", " 'forward': 2,\n", " 'found': 4,\n", " 'frame': 2,\n", " 'friday': 1,\n", " 'from': 11,\n", " 'front': 1,\n", " 'full': 1,\n", " 'gave': 2,\n", " 'get': 2,\n", " 'gets': 1,\n", " 'git': 1,\n", " 'gloucester': 2,\n", " 'good': 2,\n", " 'got': 4,\n", " 'great': 3,\n", " 'gumtree': 1,\n", " 'guy': 4,\n", " 'guys': 2,\n", " 'had': 13,\n", " 'half': 2,\n", " 'happened': 2,\n", " 'happening': 1,\n", " 'happens': 1,\n", " 'happy': 2,\n", " 'hated': 1,\n", " 'have': 5,\n", " 'he': 17,\n", " 'heard': 2,\n", " 'helpful': 1,\n", " 'here': 1,\n", " 'hi': 1,\n", " 'him': 4,\n", " 'his': 5,\n", " 'home': 1,\n", " 'hopefully': 1,\n", " 'hopes': 1,\n", " 'hours': 2,\n", " 'house': 1,\n", " 'how': 3,\n", " 'however': 1,\n", " 'i': 51,\n", " 'identifying': 1,\n", " 'if': 1,\n", " 'immediately': 3,\n", " 'importance': 1,\n", " 'in': 21,\n", " 'incidents': 1,\n", " 'incredulity': 1,\n", " 'industry': 1,\n", " 'information': 1,\n", " 'instance': 1,\n", " 'into': 1,\n", " 'involved': 1,\n", " 'is': 13,\n", " 'issue': 1,\n", " 'it': 33,\n", " 'job': 1,\n", " 'johnstown': 1,\n", 
" 'just': 3,\n", " 'keeping': 1,\n", " 'kid': 3,\n", " 'kind': 1,\n", " 'knew': 5,\n", " 'know': 2,\n", " 'known': 1,\n", " 'lad': 2,\n", " 'last': 3,\n", " 'later': 4,\n", " 'left': 1,\n", " 'less': 1,\n", " 'like': 3,\n", " 'line': 1,\n", " 'list': 1,\n", " 'living': 1,\n", " 'loaded': 1,\n", " 'locked': 1,\n", " 'london': 2,\n", " 'long': 1,\n", " 'look': 1,\n", " 'looking': 1,\n", " 'low': 2,\n", " 'luckily': 1,\n", " 'm': 6,\n", " 'mac': 1,\n", " 'made': 1,\n", " 'make': 5,\n", " 'managed': 1,\n", " 'many': 2,\n", " 'marks': 1,\n", " 'me': 12,\n", " 'meet': 1,\n", " 'meeting': 1,\n", " 'might': 1,\n", " 'mile': 1,\n", " 'miles': 1,\n", " 'mine': 2,\n", " 'minutes': 2,\n", " 'months': 2,\n", " 'more': 1,\n", " 'morning': 1,\n", " 'most': 2,\n", " 'movements': 1,\n", " 'mugged': 1,\n", " 'my': 24,\n", " 'myself': 2,\n", " 'name': 3,\n", " 'needed': 1,\n", " 'neighbours': 1,\n", " 'never': 2,\n", " 'next': 1,\n", " 'night': 1,\n", " 'no': 4,\n", " 'none': 1,\n", " 'not': 7,\n", " 'nothing': 2,\n", " 'noticed': 1,\n", " 'notified': 1,\n", " 'now': 2,\n", " 'number': 3,\n", " 'observed': 1,\n", " 'obviously': 1,\n", " 'of': 18,\n", " 'off': 5,\n", " 'officer': 3,\n", " 'on': 17,\n", " 'one': 6,\n", " 'only': 2,\n", " 'opposite': 1,\n", " 'original': 1,\n", " 'other': 1,\n", " 'our': 5,\n", " 'out': 4,\n", " 'outcome': 1,\n", " 'outside': 1,\n", " 'over': 1,\n", " 'own': 5,\n", " 'owner': 2,\n", " 'page': 1,\n", " 'paid': 1,\n", " 'pals': 1,\n", " 'past': 1,\n", " 'people': 1,\n", " 'per': 1,\n", " 'person': 3,\n", " 'philip': 1,\n", " 'phone': 4,\n", " 'phoned': 4,\n", " 'pissed': 1,\n", " 'places': 1,\n", " 'plain': 1,\n", " 'pm': 1,\n", " 'point': 1,\n", " 'police': 22,\n", " 'policeman': 1,\n", " 'post': 1,\n", " 'powered': 1,\n", " 'poxy': 1,\n", " 'preloved': 1,\n", " 'previious': 1,\n", " 'priority': 1,\n", " 'privacy': 1,\n", " 'prove': 1,\n", " 'push': 1,\n", " 'put': 1,\n", " 'putting': 1,\n", " 'question': 1,\n", " 'quick': 1,\n", " 'rack': 1,\n", " 
'rang': 2,\n", " 'rant': 1,\n", " 'really': 2,\n", " 'reason': 1,\n", " 'recently': 1,\n", " 'reclaimed': 1,\n", " 'reeled': 1,\n", " 'reference': 1,\n", " 'remotely': 1,\n", " 'repossessing': 1,\n", " 'researching': 1,\n", " 'respond': 1,\n", " 'responding': 1,\n", " 'response': 3,\n", " 'riding': 2,\n", " 'right': 2,\n", " 'risk': 1,\n", " 'road': 1,\n", " 'robbery': 1,\n", " 'running': 1,\n", " 's': 11,\n", " 'sadly': 1,\n", " 'safe': 1,\n", " 'said': 2,\n", " 'sale': 3,\n", " 'sales': 1,\n", " 'same': 2,\n", " 'sanchez': 1,\n", " 'saw': 3,\n", " 'say': 1,\n", " 'saying': 1,\n", " 'scroats': 1,\n", " 'second': 1,\n", " 'seconds': 2,\n", " 'securing': 1,\n", " 'see': 1,\n", " 'sell': 2,\n", " 'selling': 1,\n", " 'sete': 1,\n", " 'shocking': 1,\n", " 'side': 1,\n", " 'sight': 1,\n", " 'simple': 2,\n", " 'since': 2,\n", " 'sister': 1,\n", " 'site': 2,\n", " 'sits': 1,\n", " 'situation': 2,\n", " 'skatepark': 2,\n", " 'snow': 1,\n", " 'so': 9,\n", " 'sold': 1,\n", " 'some': 3,\n", " 'someone': 1,\n", " 'soon': 1,\n", " 'sorted': 1,\n", " 'sound': 2,\n", " 'sounds': 1,\n", " 'south': 1,\n", " 'spent': 2,\n", " 'spoke': 1,\n", " 'spotted': 2,\n", " 'stand': 2,\n", " 'started': 1,\n", " 'stating': 1,\n", " 'station': 1,\n", " 'statistics': 1,\n", " 'steal': 1,\n", " 'stealing': 1,\n", " 'still': 1,\n", " 'stolen': 9,\n", " 'stop': 1,\n", " 'stories': 2,\n", " 'story': 2,\n", " 'subscribe': 1,\n", " 'subsequently': 1,\n", " 'sure': 2,\n", " 'surprise': 1,\n", " 'surprises': 1,\n", " 'surprising': 1,\n", " 'suspension': 1,\n", " 't': 6,\n", " 'take': 1,\n", " 'taken': 2,\n", " 'technology': 1,\n", " 'tell': 2,\n", " 'terms': 1,\n", " 'text': 1,\n", " 'than': 1,\n", " 'that': 12,\n", " 'the': 90,\n", " 'theft': 1,\n", " 'theif': 1,\n", " 'their': 1,\n", " 'them': 4,\n", " 'then': 2,\n", " 'there': 9,\n", " 'these': 1,\n", " 'they': 11,\n", " 'thief': 2,\n", " 'thieves': 1,\n", " 'thing': 3,\n", " 'things': 2,\n", " 'this': 20,\n", " 'thoroughly': 1,\n", " 'those': 1,\n", 
" 'thought': 1,\n", " 'three': 1,\n", " 'tie': 1,\n", " 'time': 2,\n", " 'to': 45,\n", " 'told': 4,\n", " 'too': 1,\n", " 'took': 1,\n", " 'tops': 1,\n", " 'touch': 1,\n", " 'town': 2,\n", " 'trace': 1,\n", " 'traceable': 1,\n", " 'tracking': 1,\n", " 'trawling': 1,\n", " 'tries': 1,\n", " 'trying': 4,\n", " 'tuesday': 1,\n", " 'turn': 2,\n", " 'turns': 1,\n", " 'two': 2,\n", " 'uk': 2,\n", " 'uncertain': 1,\n", " 'under': 1,\n", " 'underground': 1,\n", " 'understood': 1,\n", " 'units': 1,\n", " 'unofficial': 1,\n", " 'unthinkable': 1,\n", " 'up': 6,\n", " 'usual': 1,\n", " 'utterly': 1,\n", " 'van': 1,\n", " 've': 2,\n", " 'vendor': 3,\n", " 'venue': 1,\n", " 'very': 2,\n", " 'victim': 1,\n", " 'wages': 1,\n", " 'waited': 1,\n", " 'waiting': 1,\n", " 'want': 2,\n", " 'was': 34,\n", " 'wasn': 1,\n", " 'watched': 1,\n", " 'way': 3,\n", " 'we': 3,\n", " 'week': 1,\n", " 'weeks': 2,\n", " 'well': 3,\n", " 'were': 1,\n", " 'what': 7,\n", " 'wheels': 1,\n", " 'when': 6,\n", " 'where': 2,\n", " 'which': 4,\n", " 'while': 2,\n", " 'whilst': 2,\n", " 'who': 5,\n", " 'whole': 2,\n", " 'why': 1,\n", " 'will': 1,\n", " 'with': 12,\n", " 'without': 1,\n", " 'won': 1,\n", " 'word': 1,\n", " 'words': 1,\n", " 'work': 5,\n", " 'works': 1,\n", " 'worst': 1,\n", " 'would': 6,\n", " 'wouldn': 1,\n", " 'wow': 1,\n", " 'wrexham': 1,\n", " 'year': 1,\n", " 'years': 4,\n", " 'you': 6,\n", " 'your': 5,\n", " 'yours': 1})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sumbags = sum(bagsofwords, collections.Counter())\n", "sumbags" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "ename": "NameError", "evalue": "name 'stopwords' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in 
\u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'stopwords' is not defined" ] } ], "source": [ "# Drop English stopwords so topical words dominate the counts.\n", "import nltk\n", "from nltk.corpus import stopwords  # this import was missing -> NameError above\n", "nltk.download('stopwords', quiet=True)  # fetch corpus once if not installed\n", "stops = set(stopwords.words('english'))\n", "meaningful_words = [word for word in wordarray if word not in stops]\n", "words1 = \" \".join(meaningful_words)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }