{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_df = pd.read_csv(\"article.txt\", sep='\\n', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def line_process(s):\n",
    "    s = s.lower()\n",
    "    s = re.sub(\"[\\)\\(\\.,—\\-:«»\\t!]\", \" \", s)\n",
    "    while s.find(\"  \") != -1:\n",
    "        s = re.sub(\"  \", \" \", s)\n",
    "    s = s.strip()\n",
    "    return s\n",
    "df_1 = raw_df[0].apply(line_process)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_arr = np.concatenate(df_1.apply(lambda s: s.split()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2277</td>\n",
       "      <td>2277.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>1062</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>и</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>91</td>\n",
       "      <td>2277.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        word   count\n",
       "count   2277  2277.0\n",
       "unique  1062     1.0\n",
       "top        и     1.0\n",
       "freq      91  2277.0"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "                  data=[df_arr[:], np.ones(df_arr.size)[:]]\n",
    "                 ).transpose().rename({0:'word', 1:'count'}, axis='columns')\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>word</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      count\n",
       "word       \n",
       "0         2\n",
       "10        2\n",
       "100       1\n",
       "13        2\n",
       "14        1"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_count = df.groupby('word').count()\n",
    "df_count.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>word</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>меня</th>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>мне</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>мной</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>я</th>\n",
       "      <td>69</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      count\n",
       "word       \n",
       "меня     24\n",
       "мне       6\n",
       "мной      1\n",
       "я        69"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ya = df[(df['word'] == \"я\") |\n",
    "        (df['word'] == \"меня\") | \n",
    "        (df['word'] == 'мне') |\n",
    "        (df['word'] == 'мной')]\n",
    "ya.groupby('word').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>word</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>мы</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>нам</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>нас</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      count\n",
       "word       \n",
       "мы        7\n",
       "нам       1\n",
       "нас       3"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nas = df[(df['word'] == \"мы\") |\n",
    "        (df['word'] == \"нам\") | \n",
    "        (df['word'] == 'нас') |\n",
    "        (df['word'] == 'нами')]\n",
    "nas.groupby('word').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>word</th>\n",
       "      <th>вы</th>\n",
       "      <th>меня</th>\n",
       "      <th>мне</th>\n",
       "      <th>мной</th>\n",
       "      <th>мы</th>\n",
       "      <th>нам</th>\n",
       "      <th>нас</th>\n",
       "      <th>он</th>\n",
       "      <th>она</th>\n",
       "      <th>они</th>\n",
       "      <th>свои</th>\n",
       "      <th>себя</th>\n",
       "      <th>те</th>\n",
       "      <th>тех</th>\n",
       "      <th>ты</th>\n",
       "      <th>я</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1</td>\n",
       "      <td>24</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>69</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "word   вы  меня  мне  мной  мы  нам  нас  он  она  они  свои  себя  те  тех  \\\n",
       "count   1    24    6     1   7    1    3   4    5    5     3     5   2    1   \n",
       "\n",
       "word   ты   я  \n",
       "count  11  69  "
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pronouns = df[(df['word'] == \"я\") |\n",
    "              (df['word'] == \"меня\") | \n",
    "              (df['word'] == 'мне') |\n",
    "              (df['word'] == 'мной') |\n",
    "              (df['word'] == \"мы\") |\n",
    "              (df['word'] == \"нам\") | \n",
    "              (df['word'] == 'нас') |\n",
    "              (df['word'] == 'нами') |\n",
    "              (df['word'] == \"ты\") |\n",
    "              (df['word'] == \"они\") | \n",
    "              (df['word'] == 'тех') |\n",
    "              (df['word'] == 'те') |\n",
    "              (df['word'] == 'она') |\n",
    "              (df['word'] == 'он') |\n",
    "              (df['word'] == 'оно') |\n",
    "              (df['word'] == 'те') |\n",
    "              (df['word'] == 'вы') |\n",
    "              (df['word'] == 'себя') |\n",
    "              (df['word'] == 'свои')]\n",
    "pronouns.groupby('word').count().transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    34\n",
       "dtype: int64"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "te = df[(df['word'] == \"ты\") |\n",
    "        (df['word'] == \"они\") | \n",
    "        (df['word'] == 'тех') |\n",
    "        (df['word'] == 'те') |\n",
    "        (df['word'] == 'она') |\n",
    "        (df['word'] == 'он') |\n",
    "        (df['word'] == 'оно') |\n",
    "        (df['word'] == 'те') |\n",
    "        (df['word'] == 'вы') |\n",
    "        (df['word'] == 'себя')]\n",
    "te.groupby('word').count().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>word</th>\n",
       "      <th>ciklum</th>\n",
       "      <th>epam</th>\n",
       "      <th>itransition</th>\n",
       "      <th>juno</th>\n",
       "      <th>tieto</th>\n",
       "      <th>tietoenator</th>\n",
       "      <th>viber</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>14</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "word   ciklum  epam  itransition  juno  tieto  tietoenator  viber\n",
       "count       5     4            1    14      2            1      3"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "companies = df[(df['word'] == \"viber\") |\n",
    "               (df['word'] == \"juno\") | \n",
    "               (df['word'] == 'epam') |\n",
    "               (df['word'] == 'ciklum') |\n",
    "               (df['word'] == 'tietoenator') |\n",
    "               (df['word'] == 'tieto') |\n",
    "               (df['word'] == 'itransition')]\n",
    "companies_count = companies.groupby('word').count()\n",
    "companies_count.transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "word     14\n",
       "count    14\n",
       "dtype: int64"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "companies[companies['word'] == 'juno'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "word     16\n",
       "count    16\n",
       "dtype: int64"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "companies[~(companies['word'] == 'juno')].count()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}