{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'3.6.7'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from platform import python_version\n",
    "python_version()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import multiprocessing\n",
    "import nltk\n",
    "import numpy as np\n",
    "import sklearn\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('0.25.1', '3.4.5', '1.16.3', '0.21.1')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.__version__, nltk.__version__, np.__version__, sklearn.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from multiprocessing import Pool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import brown"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## generate a dataset that 4 times as big as the brown corpus by generating random permutations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_texts():\n",
    "    return [\" \".join(np.random.permutation(sents)) for sents in brown.sents()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "brown_df = pd.DataFrame({\n",
    "    'text': make_texts() + make_texts() + make_texts() + make_texts()\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe tex2jax_ignore\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>51747</td>\n",
       "      <td>a train riding '' I been for . ways now</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>178451</td>\n",
       "      <td>be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>132362</td>\n",
       "      <td>room my ? To ?</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                             text\n",
       "51747                                                                                                                                     a train riding '' I been for . ways now\n",
       "178451  be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home\n",
       "132362                                                                                                                                                             room my ? To ?"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "brown_df.sample(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(229360, 1)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "brown_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_lowercase(input_string):\n",
    "    return input_string.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_digits_with_token(input_string):\n",
    "    return re.sub(r\"\\b\\d+\\b\",\"tok_num\", input_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_text_length(input_string):\n",
    "    return len(re.split(r\"(?:\\s+)|(?:,)|(?:\\-)\",input_string))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_df(df):\n",
    "    \n",
    "    output_df = df.copy()\n",
    "    \n",
    "    # replace weird double quotes with normal ones\n",
    "    output_df['text']      = output_df['text'].apply(lambda text: text.replace(\"``\",'\"'))\n",
    "\n",
    "    # text to lower case\n",
    "    output_df['text']      = output_df['text'].apply(lambda text: text.lower())\n",
    "    \n",
    "    # replace number with a special token\n",
    "    output_df['text']      = output_df['text'].apply(lambda text: re.sub(r\"\\b\\d+\\b\",\"tok_num\", text))\n",
    "    \n",
    "    # take out texts that are too large or too small\n",
    "    output_df['num_words'] = output_df['text'].apply(lambda text: len(re.split(r\"(?:\\s+)|(?:,)|(?:\\-)\",text)))   \n",
    "        \n",
    "    indices_to_remove_too_large = output_df[output_df['num_words'] > 50]\n",
    "    output_df.drop(indices_to_remove_too_large.index, inplace=True)\n",
    "    \n",
    "    indices_to_remove_too_small = output_df[output_df['num_words'] < 10]\n",
    "    output_df.drop(indices_to_remove_too_small.index, inplace=True)    \n",
    "    \n",
    "    output_df.reset_index(drop=True, inplace=True)\n",
    "    \n",
    "    return output_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.55 s, sys: 16 ms, total: 2.56 s\n",
      "Wall time: 2.56 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "processed_df = process_df(brown_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe tex2jax_ignore\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>num_words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>\" investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>of had of . atlanta the , in , city city conducted '' over-all the \" deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner</td>\n",
       "      <td>47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary \" fulton september-october was the</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>'' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received \" \"</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate \" often</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                                                                                             text  \\\n",
       "0                                                                                   \" investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand   \n",
       "1  of had of . atlanta the , in , city city conducted '' over-all the \" deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner   \n",
       "2                  been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary \" fulton september-october was the   \n",
       "3                                                         '' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received \" \"   \n",
       "4                                                                                                           it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate \" often   \n",
       "\n",
       "   num_words  \n",
       "0         25  \n",
       "1         47  \n",
       "2         38  \n",
       "3         40  \n",
       "4         24  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(174440, 2)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed_df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## parallel version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_CORES = 8\n",
    "df_chunks = np.array_split(brown_df,NUM_CORES)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 172 ms, sys: 124 ms, total: 296 ms\n",
      "Wall time: 1.02 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "with multiprocessing.Pool(NUM_CORES) as pool:\n",
    "    processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(174440, 2)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed_df.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6",
   "language": "python",
   "name": "python36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}