{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0,
"worksheets": [ { "cells": [
{ "cell_type": "code", "collapsed": false,
"input": [
  "import pandas as pd\n",
  "from dateutil import parser\n",
  "from sklearn.preprocessing import LabelEncoder\n",
  "from time import time\n",
  "import numpy as np\n",
  "import scipy as sp\n",
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  "from sklearn.naive_bayes import MultinomialNB\n",
  "from sklearn.decomposition import TruncatedSVD\n",
  "from sklearn.svm import SVC\n",
  "from sklearn.linear_model import SGDClassifier\n",
  "from sklearn.utils import shuffle\n",
  "from sklearn.cross_validation import train_test_split\n",
  "from sklearn.grid_search import GridSearchCV\n",
  "from sklearn.cross_validation import cross_val_score\n",
  "from sklearn.ensemble import ExtraTreesClassifier\n",
  "import re\n",
  "from HTMLParser import HTMLParser\n",
  "from Stemmer import Stemmer"
],
"language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 },
{ "cell_type": "code", "collapsed": false,
"input": [
  "# Load the raw sentiment CSV. Column names are passed explicitly, so pandas\n",
  "# treats every row of the file as data rather than as a header.\n",
  "tic = time()\n",
  "data = pd.read_csv('data/sentiment.csv',\n",
  "                   names=['polarity', 'id', 'date', 'query', 'user', 'text'])\n",
  "print 'loading data using', time() - tic\n",
  "\n",
  "# The 'date' column is left as raw strings; nothing in this cell parses it.\n",
  "\n",
  "# Replace the polarity column with integer class codes (0 .. n_classes-1).\n",
  "# labeller is kept so the codes can be mapped back to the original values.\n",
  "tic = time()\n",
  "labeller = LabelEncoder()\n",
  "data['polarity'] = labeller.fit_transform(data['polarity'])\n",
  "print 'labelling polarity using', time() - tic"
],
"language": "python", "metadata": {},
"outputs": [
  { "output_type": "stream", "stream": "stdout", "text": [ "loading data using 4.87800216675\n", "labelling polarity using" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.187450885773\n" ] }
],
"prompt_number": 29 },
{ "cell_type": "code", "collapsed": false,
"input": [
  "# Pull the text and encoded polarity columns out as plain numpy arrays.\n",
  "texts = np.array(data.text)\n",
  "labels = np.array(data.polarity)\n",
  "print texts.shape, labels.shape"
],
"language": "python", "metadata": {},
"outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(1600000,) (1600000,)\n" ] } ],
"prompt_number": 30 },
{ "cell_type": "code", "collapsed": false,
"input": [
  "# Draw a reproducible random subsample: shuffle texts and labels together\n",
  "# (so every text keeps its label) with a fixed seed, then keep the first\n",
  "# N_SAMPLES rows.\n",
  "N_SAMPLES = 100000\n",
  "SHUFFLE_SEED = 0\n",
  "texts, labels = shuffle(texts, labels, random_state=SHUFFLE_SEED)\n",
  "texts, labels = texts[:N_SAMPLES], labels[:N_SAMPLES]"
],
"language": "python", "metadata": {}, "outputs": [], "prompt_number": 31 },
{ "cell_type": "code", "collapsed": false,
"input": [
  "## customized tokenizer, dont care about url, but need to capture \n",
  "## smiley faces\n",
  "def tokenizer(doc):\n",
  " ## remove url and user tag\n",
  " url_pat = re.compile(r'http(s?)://[\\w./]+')\n",
  " user_pat = re.compile(r'@\\w+')\n",
  " doc = url_pat.sub('', doc)\n",
  " doc = user_pat.sub('', doc)\n",
  " ## unescape HTML\n",
  " h = HTMLParser()\n",
  " doc = h.unescape(doc)\n",
  " ## smiley faces\n",
  " ## (keep a comma after every literal: adjacent strings silently concatenate)\n",
  " smiley_faces = {\n",
  " ' HAPPY ': [':-)', ':)', ':o)', ':]', ':3', ':c)',':>', \n",
  " '=]', '8)', '=)', ':}', ':^)', ':-))',\n",
  " ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', \n",
  " 'XD', '=-D', '=D', '=-3', '=3', 'B^D', ]\n",
  " , ' SAD ': ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':<', ':-[', \n",
  " ':[', ':{', ';(']\n",
  " , ' ANGERY ': [':-||', ':@', '>:(']\n",
  " , ' CRYING ': [\":'-(\", \":'(\"]\n",
  " , ' HAPPY_TEARS ': \":'-) :')\".split()\n",
  " , ' DISGUST ': \"D:< D: D8 D; D= DX v.v D-':\".split()\n",
  " , ' SURPRISE ': \">:O :-O :O 8-0\".split()\n",
  " , ' KISS ': \":* :^* ( '}{' )\".split()\n",
  " , ' WINK ': \";-) ;) *-) *) ;-] ;] ;D ;^) :-,\".split()\n",
  " , ' CHEEKY ': \">:P :-P :P X-P x-p xp XP :-p :p =p :-b :b\".split()\n",
  " , ' ANOYED ': \">:\\ >:/ :-/ :-. 
:/ :\\ =/ =\\ :L =L :S >.<\".split()\n", " , ' HEART ': ['<3']\n", " , ' BROKEN_HEART ': ['\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mimportant_words\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmiscores\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0margsort\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1001\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mimportant_words\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[0mfeature_names\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mimportant_words\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mIndexError\u001b[0m: index 7811 is out of bounds for size 7811" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "[1350 1875 4022 2594 6706 4644 6309 2590 1270 5072 1680 1269 7256 2577 2576\n", " 3557 991 4019 7316 5766 3998 5779 4594 1445 2281 6672 5464 2922 6682 5466\n", " 3596 2612 2159 2916 1513 5939 4935 2317 5712 6769 7197 1089 1526 98 5325\n", " 3534 4097 3490 7188 1908 4910 3473 4083 5505 6258 4072 5716 7208 5717 5718\n", " 5091 2977 5949 6281 517 6751 4688 3528 3533 3995 7569 498 1300 5386 4462\n", " 7496 5894 2220 2775 5418 2747 6554 6420 5420 5899 2680 937 4508 1469 7322\n", " 1769 4492 7451 656 3808 7464 5025 7441 6451 6459 2708 6457 5849 3847 6454\n", " 175 4444 3915 4959 2256 4972 1458 1834 2647 584 7559 5906 2640 1842 5363\n", " 7328 2866 6647 3977 3677 697 3956 2660 1724 7361 6598 4434 934 7371 4547\n", " 4546 3927 6577 241 1821 758 5391 1013 361 4310 5564 3159 5563 4313 5561\n", " 3154 1066 6127 1086 6026 7746 1556 4204 2458 7075 4772 4201 2047 3337 1196\n", " 1246 5656 4756 5132 6021 6890 3349 4755 1237 6059 6106 6083 842 6065 3228\n", " 408 3224 1222 398 6054 
5590 6074 4285 1360 1981 4289 6943 2421 5195 416\n", " 6089 5582 5190 4807 2430 4804 1232 7774 4795 4793 6013 4214 7697 5530 894\n", " 3393 1404 2505 5297 3446 352 7150 7148 4707 4705 6812 2487 4741 4352 360\n", " 1638 4898 3413 6838 5538 4344 1014 3077 4851 3719 5598 1466 2019 605 6564\n", " 5580 5615 4913 2788 4809 3277 4539 3740 3751 1811 7389 2118 7392 3760 1051\n", " 5403 3801 3257 4500 5857 3804 4832 6999 1128 5600 3466 7181 4841 1996 5030\n", " 7186 3037 1297 3456 6782 505 4912 6510 3049 5002 3214 5842 4515 7420 3764\n", " 3193 5415 3414 3355 6579 266 5765 1944 4653 3599 3600 6909 6722 4931 5478\n", " 3381 2173 450 3110 572 4770 2906 4758 7086 5477 7280 6874 7277 4623 4742\n", " 2151 2055 2934 3560 468 5664 3352 4678 3508 7207 2826 2830 437 1305 7355\n", " 381 4722 1833 3147 6938 434 5430 1378 3118 3703 3678 2978 7128 4975 6618\n", " 6934 3091 2125 6483 1334 1382 1311 5492 3320 7334 1110 1890 3506 5827 1477\n", " 2460 1732 5356 4051 4138 6349 6395 5364 4066 2599 2480 4374 190 3893 4299\n", " 2681 4451 1747 5361 6164 7651 5341 2316 4049 2595 5947 5055 7549 1604 4227\n", " 5082 4046 2567 5923 2649 4232 5148 736 7707 7536 2364 5176 138 4149 6285\n", " 5301 2304 7712 4084 2250 90 1572 1195 2279 4419 2704 5133 4265 7471 759\n", " 6208 4112 714 667 5290 7466 2387 5308 4477 944 7581 946 5218 7729 4110\n", " 6119 1578 4104 100 5324 1566 4458 5931 2423 4331 5887 4330 5101 1143 5929\n", " 6082 2609 3119 1871 4009 4877 5468 4013 69 803 3589 987 5067 4880 2144\n", " 5350 5472 7709 2287 6690 6195 539 2141 1623 7687 903 7149 2325 7666 5103\n", " 4702 4106 3061 3024 3027 501 3443 5705 6796 4905 1629 7675 1919 6806 3431\n", " 779 6766 7225 2949 6288 3400 5484 1165 5945 7237 5038 6841 121 2080 6833\n", " 7218 6755 5495 5496 514 2981 2342 780 2167 4003 688 3745 7399 2785 6418\n", " 1463 4235 5818 5908 5372 7383 236 3720 5815 2024 5909 3937 1830 7730 4437\n", " 4864 2812 7365 3949 5632 7402 1040 2034 1233 5215 4497 220 3805 7442 221\n", " 6461 1573 3791 6508 3849 6511 2234 
3850 2695 4471 1225 22 4294 7493 7499\n", " 7773 1368 5802 4376 4421 7574 6630 4219 7763 1845 6352 5782 153 864 5637\n", " 1197 701 3626 2652 6660 5461 7354 6635 4431 2902 6916 4608 5058 4323 3163\n", " 5162 5774 1150 20 4754 6205 56 388 1935 5613 1242 2050 7081 5241 4203\n", " 327 334 2407 6061 2416 6165 2346 818 4348 3108 3109 4883 5569 3407 2031\n", " 7129 5992 825 6117 3392 6034 5679 2459 486 3367 4305 7070 6947 2485 1405\n", " 3299 3321 1549 2020 7689 4713 2353 7230 3464 3641 1510 6612 5925 5442 5796\n", " 1710 3652 2271 7327 3647 4597 166 5064 4949 5354 5465 4945 4018 7274 5473\n", " 6704 7265 6608 7362 6306 2228 3814 5399 4485 5872 6463 1767 203 4469 4518\n", " 3756 2227 247 5043 6426 2687 2685 967 6408 5374 4545 3924 7369 134 6657\n", " 1571 7630 319 7235 7177 109 2557 7232 1095 5706 2112 494 6752 4056 6251\n", " 1264 4086 2998 4403 6256 2559 752 3535 6228 7252 2943 4656 1002 6062 402\n", " 5970 619 4292 1901 6220 1994 4715 624 3467 4498 1738 6776 1904 6092 6088\n", " 4829 1215 6982 4704 3840 4473 1569 1574 672 6990 3757 6090 6798 6971 671\n", " 1021 6392 3639 6801 4605 713 4006 4196 4675 984 3593 6688 6724 1866 6298\n", " 3573 6875 6705 475 917 4026 5937 1953 6651 6585 6345 1518 5807 4433 4067\n", " 4164 1927 5951 974 1716 1711 4366 4689 6355 4567 863 6627 3653 4361 5996\n", " 4560 6937 7210 7543 1187 1299 4911 2643 1156 2203 1276 5042 7136 7755 5434\n", " 7352 7351 5502 5130 4998 3124 7766 5197 7093 7407 5047 5309 2794 7524 2445\n", " 7525 2097 7387 2684 4990 2214 169 7376 167 1302 2666 1301 2604 177 7727\n", " 7454 7811 7655 7439 7432 1136 270 300 1101 2404 2138 2896 2894 45 7101\n", " 288 2714 4960 7450 202 4844 204 5040 2728 1096 7473 7470 5697 6098 5024\n", " 2091 831 828 5422 6854 953 1807 7431 7700 403 3424 4510 4896 6842 1017\n", " 1799 615 1553 1011 3736 2856 2129 4953 806 5272 5454 2882 7312 3617 5462\n", " 559 1440 2914 5469 865 2921 2942 6715 1557 5276 990 6025 7330 1891 5701\n", " 250 3451 3455 1081 5816 4987 2099 2727 6583 4549 3028 3707 598 
2106 244\n",
  " 4172 3687 3489 3011 5498 6761 1120 3380 2343 1784 89 2616 427 3294 6953\n",
  " 1698 718 2620 6341 6351 704 2266 1782 6360 1718 4379 7542 1720 5918 5237\n",
  " 1336 1735 1038 6921 7587 386 6219 7040 6257 1522 3252 6267 6992 5095 2546\n",
  " 1672 411 1265 412 1167 7623 1047 1676 3276 4854]\n"
] } ],
"prompt_number": 99 },
{ "cell_type": "code", "collapsed": false,
"input": [
  "# Keep only the feature columns listed in important_words.\n",
  "filtered_X = XX[:, important_words]\n",
  "print filtered_X.shape"
],
"language": "python", "metadata": {},
"outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(100000, 1000)\n" ] } ],
"prompt_number": 96 },
{ "cell_type": "code", "collapsed": false,
"input": [
  "# Hold out 30% of the rows for testing. The raw texts are split alongside\n",
  "# X and y so the three stay row-aligned; a fixed seed keeps the split\n",
  "# repeatable across runs.\n",
  "SPLIT_SEED = 0\n",
  "train_X, test_X, train_y, test_y, train_texts, test_texts = train_test_split(\n",
  "    filtered_X, yy, texts, test_size=0.3, random_state=SPLIT_SEED)\n",
  "print train_X.shape, test_X.shape\n",
  "print train_y.shape, test_y.shape"
],
"language": "python", "metadata": {},
"outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(70000, 1000) (30000, 1000)\n", "(70000,) (30000,)\n" ] } ],
"prompt_number": 97 },
{ "cell_type": "code", "collapsed": false,
"input": [
  "# Sweep the L2 regularisation strength of a logistic-loss SGD classifier and\n",
  "# print alpha, mean and std of the 5-fold cross-validation score on the\n",
  "# training split. The seed makes the SGD runs repeatable.\n",
  "# NOTE(review): the stored scores are ~0.50 for every alpha, which looks like\n",
  "# chance level -- check that the rows of filtered_X and yy are still aligned.\n",
  "SGD_SEED = 0\n",
  "sgd = SGDClassifier(loss='log', penalty='l2', random_state=SGD_SEED)\n",
  "alphas = [1e-7, 3e-7, 1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 1e-2]\n",
  "for alpha in alphas:\n",
  "    sgd.set_params(alpha=alpha)\n",
  "    scores = cross_val_score(sgd, train_X, train_y, cv=5)\n",
  "    print alpha, np.mean(scores), np.std(scores)"
],
"language": "python", "metadata": {},
"outputs": [
  { "output_type": "stream", "stream": "stdout", "text": [ "1e-07 0.497671428571 0.000510302031414\n", "3e-07" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.497628571429 0.000649018096663\n", "1e-06" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.497485714286 0.00058623670082\n", "3e-06" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.497614285714 0.000910931346682\n", "1e-05" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.497614285714 0.000425225030461\n", "3e-05" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.497814285714 0.00059005361223\n", "0.0001" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.497985714286 0.000563154381269\n", "0.0003" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.499114285714 0.000132480264221\n", "0.001" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.501042857143 0.00031815796359\n", "0.003" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.501514285714 2.85714285714e-05\n", "0.01" ] },
  { "output_type": "stream", "stream": "stdout", "text": [ " 0.497857142857 0.00125030608497\n" ] }
],
"prompt_number": 98 },
{ "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }
], "metadata": {} } ] }