{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import re\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "sns.set()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "raw_df = pd.read_csv(\"article.txt\", sep='\\n', header=None)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def line_process(s):\n", " s = s.lower()\n", " s = re.sub(\"[\\)\\(\\.,—\\-:«»\\t!]\", \" \", s)\n", " while s.find(\" \") != -1:\n", " s = re.sub(\" \", \" \", s)\n", " s = s.strip()\n", " return s\n", "df_1 = raw_df[0].apply(line_process)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df_arr = np.concatenate(df_1.apply(lambda s: s.split()))" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
count22772277.0
unique10621.0
topи1.0
freq912277.0
\n", "
" ], "text/plain": [ " word count\n", "count 2277 2277.0\n", "unique 1062 1.0\n", "top и 1.0\n", "freq 91 2277.0" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " data=[df_arr[:], np.ones(df_arr.size)[:]]\n", " ).transpose().rename({0:'word', 1:'count'}, axis='columns')\n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
word
02
102
1001
132
141
\n", "
" ], "text/plain": [ " count\n", "word \n", "0 2\n", "10 2\n", "100 1\n", "13 2\n", "14 1" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_count = df.groupby('word').count()\n", "df_count.head()" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
word
меня24
мне6
мной1
я69
\n", "
" ], "text/plain": [ " count\n", "word \n", "меня 24\n", "мне 6\n", "мной 1\n", "я 69" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ya = df[(df['word'] == \"я\") |\n", " (df['word'] == \"меня\") | \n", " (df['word'] == 'мне') |\n", " (df['word'] == 'мной')]\n", "ya.groupby('word').count()" ] }, { "cell_type": "code", "execution_count": 129, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
word
мы7
нам1
нас3
\n", "
" ], "text/plain": [ " count\n", "word \n", "мы 7\n", "нам 1\n", "нас 3" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nas = df[(df['word'] == \"мы\") |\n", " (df['word'] == \"нам\") | \n", " (df['word'] == 'нас') |\n", " (df['word'] == 'нами')]\n", "nas.groupby('word').count()" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordвыменямнемноймынамнасононаонисвоисебятетехтыя
count1246171345535211169
\n", "
" ], "text/plain": [ "word вы меня мне мной мы нам нас он она они свои себя те тех \\\n", "count 1 24 6 1 7 1 3 4 5 5 3 5 2 1 \n", "\n", "word ты я \n", "count 11 69 " ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pronouns = df[(df['word'] == \"я\") |\n", " (df['word'] == \"меня\") | \n", " (df['word'] == 'мне') |\n", " (df['word'] == 'мной') |\n", " (df['word'] == \"мы\") |\n", " (df['word'] == \"нам\") | \n", " (df['word'] == 'нас') |\n", " (df['word'] == 'нами') |\n", " (df['word'] == \"ты\") |\n", " (df['word'] == \"они\") | \n", " (df['word'] == 'тех') |\n", " (df['word'] == 'те') |\n", " (df['word'] == 'она') |\n", " (df['word'] == 'он') |\n", " (df['word'] == 'оно') |\n", " (df['word'] == 'те') |\n", " (df['word'] == 'вы') |\n", " (df['word'] == 'себя') |\n", " (df['word'] == 'свои')]\n", "pronouns.groupby('word').count().transpose()" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 34\n", "dtype: int64" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "te = df[(df['word'] == \"ты\") |\n", " (df['word'] == \"они\") | \n", " (df['word'] == 'тех') |\n", " (df['word'] == 'те') |\n", " (df['word'] == 'она') |\n", " (df['word'] == 'он') |\n", " (df['word'] == 'оно') |\n", " (df['word'] == 'те') |\n", " (df['word'] == 'вы') |\n", " (df['word'] == 'себя')]\n", "te.groupby('word').count().sum()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordciklumepamitransitionjunotietotietoenatorviber
count54114213
\n", "
" ], "text/plain": [ "word ciklum epam itransition juno tieto tietoenator viber\n", "count 5 4 1 14 2 1 3" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "companies = df[(df['word'] == \"viber\") |\n", " (df['word'] == \"juno\") | \n", " (df['word'] == 'epam') |\n", " (df['word'] == 'ciklum') |\n", " (df['word'] == 'tietoenator') |\n", " (df['word'] == 'tieto') |\n", " (df['word'] == 'itransition')]\n", "companies_count = companies.groupby('word').count()\n", "companies_count.transpose()" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "word 14\n", "count 14\n", "dtype: int64" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "companies[companies['word'] == 'juno'].count()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "word 16\n", "count 16\n", "dtype: int64" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "companies[~(companies['word'] == 'juno')].count()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }