{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'3.6.7'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from platform import python_version\n", "python_version()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import multiprocessing\n", "import nltk\n", "import numpy as np\n", "import sklearn\n", "import re" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('0.25.1', '3.4.5', '1.16.3', '0.21.1')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.__version__, nltk.__version__, np.__version__, sklearn.__version__" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from multiprocessing import Pool" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import brown" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## generate a dataset that 4 times as big as the brown corpus by generating random permutations" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def make_texts():\n", " return [\" \".join(np.random.permutation(sents)) for sents in brown.sents()]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "brown_df = pd.DataFrame({\n", " 'text': make_texts() + make_texts() + make_texts() + make_texts()\n", "})" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text
51747a train riding '' I been for . ways now
178451be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home
132362room my ? To ?
\n", "
" ], "text/plain": [ " text\n", "51747 a train riding '' I been for . ways now\n", "178451 be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home\n", "132362 room my ? To ?" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "brown_df.sample(3)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(229360, 1)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "brown_df.shape" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def to_lowercase(input_string):\n", " return input_string.lower()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def replace_digits_with_token(input_string):\n", " return re.sub(r\"\\b\\d+\\b\",\"tok_num\", input_string)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def get_text_length(input_string):\n", " return len(re.split(r\"(?:\\s+)|(?:,)|(?:\\-)\",input_string))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def process_df(df):\n", " \n", " output_df = df.copy()\n", " \n", " # replace weird double quotes with normal ones\n", " output_df['text'] = output_df['text'].apply(lambda text: text.replace(\"``\",'\"'))\n", "\n", " # text to lower case\n", " output_df['text'] = output_df['text'].apply(lambda text: text.lower())\n", " \n", " # replace number with a special token\n", " output_df['text'] = output_df['text'].apply(lambda text: re.sub(r\"\\b\\d+\\b\",\"tok_num\", text))\n", " \n", " # take out texts that are too large or too small\n", " output_df['num_words'] = output_df['text'].apply(lambda text: len(re.split(r\"(?:\\s+)|(?:,)|(?:\\-)\",text))) \n", " \n", " indices_to_remove_too_large = output_df[output_df['num_words'] > 50]\n", " output_df.drop(indices_to_remove_too_large.index, inplace=True)\n", " \n", " indices_to_remove_too_small = output_df[output_df['num_words'] < 10]\n", " output_df.drop(indices_to_remove_too_small.index, inplace=True) \n", " \n", " output_df.reset_index(drop=True, inplace=True)\n", " \n", " return output_df" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.55 s, sys: 16 ms, total: 2.56 s\n", "Wall time: 2.56 s\n" ] } ], "source": [ "%%time\n", "processed_df = process_df(brown_df)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textnum_words
0\" investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand25
1of had of . atlanta the , in , city city conducted '' over-all the \" deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner47
2been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary \" fulton september-october was the38
3'' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received \" \"40
4it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate \" often24
\n", "
" ], "text/plain": [ " text \\\n", "0 \" investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand \n", "1 of had of . atlanta the , in , city city conducted '' over-all the \" deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner \n", "2 been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary \" fulton september-october was the \n", "3 '' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received \" \" \n", "4 it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate \" often \n", "\n", " num_words \n", "0 25 \n", "1 47 \n", "2 38 \n", "3 40 \n", "4 24 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "processed_df.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(174440, 2)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "processed_df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## parallel version" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "NUM_CORES = 8\n", "df_chunks = np.array_split(brown_df,NUM_CORES)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 172 ms, sys: 124 ms, total: 296 ms\n", "Wall time: 1.02 s\n" ] } ], "source": [ "%%time\n", "\n", "with multiprocessing.Pool(NUM_CORES) as pool:\n", " processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(174440, 2)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "processed_df.shape" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.6", "language": "python", "name": "python36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }