{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from __future__ import print_function\n", "from __future__ import division\n", "\n", "from tqdm import *\n", "import csv\n", "import numpy as np\n", "import pandas as pd\n", "import scipy\n", "import nltk\n", "import sklearn\n", "import random\n", "import re\n", "from sklearn.metrics import f1_score, precision_score, recall_score\n", "from sklearn.linear_model import LogisticRegression, LinearRegression\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.decomposition import PCA, RandomizedPCA\n", "from sklearn import svm\n", "from sklearn.grid_search import GridSearchCV, ParameterGrid\n", "from sklearn.pipeline import Pipeline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "path_to_csv_data = \"/home/felipe/auto-tagger/data/RawRCV1/csv/reuters-rcv1-full.csv\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[0mseries\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout_tags\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 39\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 40\u001b[1;33m \u001b[0mpartial_Y_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mseries\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_dummies\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m','\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0muint8\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 41\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;32mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpartial_Y_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/strings.pyc\u001b[0m in \u001b[0;36mget_dummies\u001b[1;34m(self, sep)\u001b[0m\n\u001b[0;32m 1624\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstr_get_dummies\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1625\u001b[0m return self._wrap_result(result, use_codes=(not self._is_categorical),\n\u001b[1;32m-> 1626\u001b[1;33m name=name, expand=True)\n\u001b[0m\u001b[0;32m 1627\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1628\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstr_translate\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/strings.pyc\u001b[0m in \u001b[0;36m_wrap_result\u001b[1;34m(self, result, use_codes, name, expand)\u001b[0m\n\u001b[0;32m 1391\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mexpand\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1392\u001b[0m \u001b[0mcons\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_orig\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_constructor_expanddim\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1393\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcons\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1394\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1395\u001b[0m \u001b[1;31m# Must be a Series\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 303\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mis_named_tuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 304\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fields\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 305\u001b[1;33m \u001b[0marrays\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_to_arrays\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 306\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 307\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_to_arrays\u001b[1;34m(data, columns, coerce_float, dtype)\u001b[0m\n\u001b[0;32m 5550\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtuple\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5551\u001b[0m return _list_to_arrays(data, columns, coerce_float=coerce_float,\n\u001b[1;32m-> 5552\u001b[1;33m dtype=dtype)\n\u001b[0m\u001b[0;32m 5553\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5554\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_list_to_arrays\u001b[1;34m(data, columns, coerce_float, dtype)\u001b[0m\n\u001b[0;32m 5607\u001b[0m \u001b[0mcontent\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_object_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mT\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5608\u001b[0m return _convert_object_array(content, columns, dtype=dtype,\n\u001b[1;32m-> 5609\u001b[1;33m coerce_float=coerce_float)\n\u001b[0m\u001b[0;32m 5610\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5611\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_convert_object_array\u001b[1;34m(content, columns, coerce_float, dtype)\u001b[0m\n\u001b[0;32m 5675\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5676\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5677\u001b[1;33m \u001b[0marrays\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mconvert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0marr\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mcontent\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5678\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5679\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0marrays\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36mconvert\u001b[1;34m(arr)\u001b[0m\n\u001b[0;32m 5671\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mconvert\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5672\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mobject\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mdtype\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobject\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 5673\u001b[1;33m \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmaybe_convert_objects\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtry_float\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcoerce_float\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5674\u001b[0m \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_possibly_cast_to_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5675\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "# documents = []\n", "# tags = []\n", "\n", "chunks = [\n", " [0,100000],\n", " [100001,200000],\n", " [200001,300000],\n", " [300001,400000],\n", " [400001,500000],\n", " [500001,600000],\n", " [600001,700000],\n", " [700001,800000],\n", " [800001,900000]\n", "]\n", " \n", "# http://stackoverflow.com/a/654046/436721\n", "def read_and_save(begin,end,tags):\n", " with open(path_to_csv_data) as file:\n", " reader = csv.reader(file,escapechar='\\\\') \n", " \n", " for (i,line) in enumerate(reader):\n", " \n", " if(i < begin):\n", " continue\n", " \n", " if(i >= end):\n", " return tags\n", " \n", " (id,title,body,labels) = line\n", " \n", " text = title+\" \"+body\n", " \n", "# documents.append(text)\n", " tags.append(labels)\n", "\n", "for fromindex,toindex in chunks:\n", " out_tags = read_and_save(fromindex,toindex,[])\n", " series = pd.Series(out_tags)\n", " \n", " partial_Y_df = series.str.get_dummies(sep=',').astype(np.uint8)\n", " \n", " print(partial_Y_df.info()) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "(len(documents),len(tags))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tags[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def preprocessor(string):\n", " repl = re.sub('<','',string)\n", " repl = re.sub('<[^>]+>','',string)\n", " return repl.lower()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "series = pd.Series(tags)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# split = series.str.split(',');split" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Y_df = series.str.get_dummies(sep=',').astype(np.uint8)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Y_df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Y_df =pd.get_dummies(series.str.split(',')).astype(np.uint8)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Y_df.head(1).iloc[0].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }