{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"sns.set()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"raw_df = pd.read_csv(\"article.txt\", sep='\\n', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def line_process(s):\n",
" s = s.lower()\n",
" s = re.sub(\"[\\)\\(\\.,—\\-:«»\\t!]\", \" \", s)\n",
" while s.find(\" \") != -1:\n",
" s = re.sub(\" \", \" \", s)\n",
" s = s.strip()\n",
" return s\n",
"df_1 = raw_df[0].apply(line_process)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df_arr = np.concatenate(df_1.apply(lambda s: s.split()))"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 2277 | \n",
" 2277.0 | \n",
"
\n",
" \n",
" unique | \n",
" 1062 | \n",
" 1.0 | \n",
"
\n",
" \n",
" top | \n",
" и | \n",
" 1.0 | \n",
"
\n",
" \n",
" freq | \n",
" 91 | \n",
" 2277.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" word count\n",
"count 2277 2277.0\n",
"unique 1062 1.0\n",
"top и 1.0\n",
"freq 91 2277.0"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(\n",
" data=[df_arr[:], np.ones(df_arr.size)[:]]\n",
" ).transpose().rename({0:'word', 1:'count'}, axis='columns')\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
"
\n",
" \n",
" word | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" 10 | \n",
" 2 | \n",
"
\n",
" \n",
" 100 | \n",
" 1 | \n",
"
\n",
" \n",
" 13 | \n",
" 2 | \n",
"
\n",
" \n",
" 14 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count\n",
"word \n",
"0 2\n",
"10 2\n",
"100 1\n",
"13 2\n",
"14 1"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_count = df.groupby('word').count()\n",
"df_count.head()"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
"
\n",
" \n",
" word | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" меня | \n",
" 24 | \n",
"
\n",
" \n",
" мне | \n",
" 6 | \n",
"
\n",
" \n",
" мной | \n",
" 1 | \n",
"
\n",
" \n",
" я | \n",
" 69 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count\n",
"word \n",
"меня 24\n",
"мне 6\n",
"мной 1\n",
"я 69"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ya = df[(df['word'] == \"я\") |\n",
" (df['word'] == \"меня\") | \n",
" (df['word'] == 'мне') |\n",
" (df['word'] == 'мной')]\n",
"ya.groupby('word').count()"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
"
\n",
" \n",
" word | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" мы | \n",
" 7 | \n",
"
\n",
" \n",
" нам | \n",
" 1 | \n",
"
\n",
" \n",
" нас | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count\n",
"word \n",
"мы 7\n",
"нам 1\n",
"нас 3"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nas = df[(df['word'] == \"мы\") |\n",
" (df['word'] == \"нам\") | \n",
" (df['word'] == 'нас') |\n",
" (df['word'] == 'нами')]\n",
"nas.groupby('word').count()"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" word | \n",
" вы | \n",
" меня | \n",
" мне | \n",
" мной | \n",
" мы | \n",
" нам | \n",
" нас | \n",
" он | \n",
" она | \n",
" они | \n",
" свои | \n",
" себя | \n",
" те | \n",
" тех | \n",
" ты | \n",
" я | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1 | \n",
" 24 | \n",
" 6 | \n",
" 1 | \n",
" 7 | \n",
" 1 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 5 | \n",
" 3 | \n",
" 5 | \n",
" 2 | \n",
" 1 | \n",
" 11 | \n",
" 69 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"word вы меня мне мной мы нам нас он она они свои себя те тех \\\n",
"count 1 24 6 1 7 1 3 4 5 5 3 5 2 1 \n",
"\n",
"word ты я \n",
"count 11 69 "
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pronouns = df[(df['word'] == \"я\") |\n",
" (df['word'] == \"меня\") | \n",
" (df['word'] == 'мне') |\n",
" (df['word'] == 'мной') |\n",
" (df['word'] == \"мы\") |\n",
" (df['word'] == \"нам\") | \n",
" (df['word'] == 'нас') |\n",
" (df['word'] == 'нами') |\n",
" (df['word'] == \"ты\") |\n",
" (df['word'] == \"они\") | \n",
" (df['word'] == 'тех') |\n",
" (df['word'] == 'те') |\n",
" (df['word'] == 'она') |\n",
" (df['word'] == 'он') |\n",
" (df['word'] == 'оно') |\n",
" (df['word'] == 'те') |\n",
" (df['word'] == 'вы') |\n",
" (df['word'] == 'себя') |\n",
" (df['word'] == 'свои')]\n",
"pronouns.groupby('word').count().transpose()"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 34\n",
"dtype: int64"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"te = df[(df['word'] == \"ты\") |\n",
" (df['word'] == \"они\") | \n",
" (df['word'] == 'тех') |\n",
" (df['word'] == 'те') |\n",
" (df['word'] == 'она') |\n",
" (df['word'] == 'он') |\n",
" (df['word'] == 'оно') |\n",
" (df['word'] == 'те') |\n",
" (df['word'] == 'вы') |\n",
" (df['word'] == 'себя')]\n",
"te.groupby('word').count().sum()"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" word | \n",
" ciklum | \n",
" epam | \n",
" itransition | \n",
" juno | \n",
" tieto | \n",
" tietoenator | \n",
" viber | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 14 | \n",
" 2 | \n",
" 1 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"word ciklum epam itransition juno tieto tietoenator viber\n",
"count 5 4 1 14 2 1 3"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"companies = df[(df['word'] == \"viber\") |\n",
" (df['word'] == \"juno\") | \n",
" (df['word'] == 'epam') |\n",
" (df['word'] == 'ciklum') |\n",
" (df['word'] == 'tietoenator') |\n",
" (df['word'] == 'tieto') |\n",
" (df['word'] == 'itransition')]\n",
"companies_count = companies.groupby('word').count()\n",
"companies_count.transpose()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"word 14\n",
"count 14\n",
"dtype: int64"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"companies[companies['word'] == 'juno'].count()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"word 16\n",
"count 16\n",
"dtype: int64"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"companies[~(companies['word'] == 'juno')].count()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}