{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'3.6.7'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from platform import python_version\n",
"python_version()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import multiprocessing\n",
"import nltk\n",
"import numpy as np\n",
"import sklearn\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('0.25.1', '3.4.5', '1.16.3', '0.21.1')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.__version__, nltk.__version__, np.__version__, sklearn.__version__"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from multiprocessing import Pool"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import brown"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## generate a dataset that 4 times as big as the brown corpus by generating random permutations"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def make_texts():\n",
" return [\" \".join(np.random.permutation(sents)) for sents in brown.sents()]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"brown_df = pd.DataFrame({\n",
" 'text': make_texts() + make_texts() + make_texts() + make_texts()\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
"
\n",
" \n",
" \n",
" \n",
" 51747 | \n",
" a train riding '' I been for . ways now | \n",
"
\n",
" \n",
" 178451 | \n",
" be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home | \n",
"
\n",
" \n",
" 132362 | \n",
" room my ? To ? | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text\n",
"51747 a train riding '' I been for . ways now\n",
"178451 be while fallout who should selfish because had build would I . Sir a he to in for no his sit read man of his shelters a felt it not neighbors -- secure shelter him home\n",
"132362 room my ? To ?"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"brown_df.sample(3)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(229360, 1)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"brown_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def to_lowercase(input_string):\n",
" return input_string.lower()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def replace_digits_with_token(input_string):\n",
" return re.sub(r\"\\b\\d+\\b\",\"tok_num\", input_string)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def get_text_length(input_string):\n",
" return len(re.split(r\"(?:\\s+)|(?:,)|(?:\\-)\",input_string))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"def process_df(df):\n",
" \n",
" output_df = df.copy()\n",
" \n",
" # replace weird double quotes with normal ones\n",
" output_df['text'] = output_df['text'].apply(lambda text: text.replace(\"``\",'\"'))\n",
"\n",
" # text to lower case\n",
" output_df['text'] = output_df['text'].apply(lambda text: text.lower())\n",
" \n",
" # replace number with a special token\n",
" output_df['text'] = output_df['text'].apply(lambda text: re.sub(r\"\\b\\d+\\b\",\"tok_num\", text))\n",
" \n",
" # take out texts that are too large or too small\n",
" output_df['num_words'] = output_df['text'].apply(lambda text: len(re.split(r\"(?:\\s+)|(?:,)|(?:\\-)\",text))) \n",
" \n",
" indices_to_remove_too_large = output_df[output_df['num_words'] > 50]\n",
" output_df.drop(indices_to_remove_too_large.index, inplace=True)\n",
" \n",
" indices_to_remove_too_small = output_df[output_df['num_words'] < 10]\n",
" output_df.drop(indices_to_remove_too_small.index, inplace=True) \n",
" \n",
" output_df.reset_index(drop=True, inplace=True)\n",
" \n",
" return output_df"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.55 s, sys: 16 ms, total: 2.56 s\n",
"Wall time: 2.56 s\n"
]
}
],
"source": [
"%%time\n",
"processed_df = process_df(brown_df)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" text | \n",
" num_words | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" \" investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand | \n",
" 25 | \n",
"
\n",
" \n",
" 1 | \n",
" of had of . atlanta the , in , city city conducted '' over-all the \" deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner | \n",
" 47 | \n",
"
\n",
" \n",
" 2 | \n",
" been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary \" fulton september-october was the | \n",
" 38 | \n",
"
\n",
" \n",
" 3 | \n",
" '' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received \" \" | \n",
" 40 | \n",
"
\n",
" \n",
" 4 | \n",
" it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate \" often | \n",
" 24 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" text \\\n",
"0 \" investigation election recent evidence atlanta's jury friday irregularities of that any place said county '' produced no . primary the an took fulton grand \n",
"1 of had of . atlanta the , in , city city conducted '' over-all the \" deserves term-end the committee which the and presentments election which executive election the for the praise that further thanks the in was charge of jury said manner \n",
"2 been hard-fought irregularities reports '' pye possible in of by by . allen court jury durwood jr. judge term had which won superior to mayor-nominate investigate charged the ivan primary \" fulton september-october was the \n",
"3 '' election of a the city in , . was the such voters jury the '' considering reports , relative , only and handful widespread of size number of this the the said interest received \" \" \n",
"4 it and are ambiguous did many . outmoded the registration georgia's find jury that said or '' laws election of and inadequate \" often \n",
"\n",
" num_words \n",
"0 25 \n",
"1 47 \n",
"2 38 \n",
"3 40 \n",
"4 24 "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processed_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(174440, 2)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processed_df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## parallel version"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"NUM_CORES = 8\n",
"df_chunks = np.array_split(brown_df,NUM_CORES)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 172 ms, sys: 124 ms, total: 296 ms\n",
"Wall time: 1.02 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"with multiprocessing.Pool(NUM_CORES) as pool:\n",
" processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(174440, 2)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processed_df.shape"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}