{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.180691Z", "start_time": "2020-09-30T23:15:48.111199Z" }, "pycharm": { "is_executing": true } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pandas in /usr/lib64/python3.8/site-packages (0.25.3)\n", "Requirement already satisfied: numpy in /usr/lib64/python3.8/site-packages (1.18.4)\n", "Requirement already satisfied: sklearn in /home/pasha/.local/lib/python3.8/site-packages (0.0)\n", "Requirement already satisfied: nltk in /home/pasha/.local/lib/python3.8/site-packages (3.4.5)\n", "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/lib/python3.8/site-packages (from pandas) (2.8.0)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3.8/site-packages (from pandas) (2020.1)\n", "Requirement already satisfied: scikit-learn in /home/pasha/.local/lib/python3.8/site-packages (from sklearn) (0.22.2.post1)\n", "Requirement already satisfied: six in /usr/lib/python3.8/site-packages (from nltk) (1.14.0)\n", "Requirement already satisfied: joblib>=0.11 in /home/pasha/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (0.14.1)\n", "Requirement already satisfied: scipy>=0.17.0 in /usr/lib64/python3.8/site-packages (from scikit-learn->sklearn) (1.4.1)\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install --user pandas numpy sklearn nltk" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.728298Z", "start_time": "2020-09-30T23:15:49.185540Z" }, "pycharm": { "is_executing": true } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.732519Z", "start_time": "2020-09-30T23:15:49.730219Z" }, "pycharm": { "is_executing": true } }, "outputs": [], "source": [ "# Define constants\n", "tfidf = TfidfVectorizer(stop_words='english')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.747272Z", "start_time": "2020-09-30T23:15:49.734333Z" } }, "outputs": [], "source": [ "# By https://ru.stackoverflow.com/questions/995616/Как-сделать-tf-idf-для-русских-текстов\n", "\n", "#import nltk\n", "#from nltk.corpus import stopwords as nltk_stopwords\n", "\n", "#nltk.download('stopwords')\n", "#stopwords = set(nltk_stopwords.words('russian') )\n", "#tfidf = TfidfVectorizer(stop_words=stopwords)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.766942Z", "start_time": "2020-09-30T23:15:49.748416Z" }, "pycharm": { "is_executing": true }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Article</th>\n", " </tr>\n", " <tr>\n", " <th>№</th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>Раз, два, три, четыре</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>вышел зайчик погулять</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>вдруг охотник выбегает</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>прямо в зайчик стреляет</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>пуляет прямо в зайчик</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Article\n", "№ \n", "1 Раз, два, три, четыре\n", "2 вышел зайчик погулять\n", "3 вдруг охотник выбегает\n", "4 прямо в зайчик стреляет\n", "5 пуляет прямо в зайчик" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Minimal example:\n", "articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.Ru', index_col='№')\n", "# articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.En', index_col='№')\n", "# Full real file:\n", "# articles = pd.read_csv('RDC-135_articles_golden_set_mapping.csv', index_col='№')\n", "articles" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.781595Z", "start_time": "2020-09-30T23:15:49.768242Z" }, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "array([[0. , 0. , 0. , 0.5 , 0. ,\n", " 0. , 0. , 0. , 0. , 0.5 ,\n", " 0. , 0.5 , 0.5 ],\n", " [0. , 0. , 0.63907044, 0. , 0.42799292,\n", " 0. , 0.63907044, 0. , 0. , 0. ,\n", " 0. , 0. , 0. ],\n", " [0.57735027, 0.57735027, 0. , 0. , 0. ,\n", " 0.57735027, 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0.4622077 ,\n", " 0. , 0. , 0.55681615, 0. , 0. ,\n", " 0.69015927, 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0.4622077 ,\n", " 0. , 0. , 0.55681615, 0.69015927, 0. ,\n", " 0. , 0. , 0. ]])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_matrix = tfidf.fit_transform(articles['Article'])\n", "tfidf_matrix.toarray() # or res.todense()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.790933Z", "start_time": "2020-09-30T23:15:49.782853Z" } }, "outputs": [ { "data": { "text/plain": [ "(5, 13)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_matrix.shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.804071Z", "start_time": "2020-09-30T23:15:49.793459Z" } }, "outputs": [ { "data": { "text/plain": [ "'english'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf.stop_words" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Вычисление\n", "\n", "## Самые популярные слова" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.813496Z", "start_time": "2020-09-30T23:15:49.805692Z" } }, "outputs": [ { "data": { "text/plain": [ "['вдруг',\n", " 'выбегает',\n", " 'вышел',\n", " 'два',\n", " 'зайчик',\n", " 'охотник',\n", " 'погулять',\n", " 'прямо',\n", " 'пуляет',\n", " 'раз',\n", " 'стреляет',\n", " 'три',\n", " 'четыре']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html\n", "# Вышеупомянутый X имеет значения TF-IDF всех документов в корпусе. Это большая разреженная матрица.\n", "# Теперь,\n", "tfidf.get_feature_names()\n", "# это дает вам список всех токенов или n-граммов или слов. Для первого документа в вашем корпусе, " ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.823154Z", "start_time": "2020-09-30T23:15:49.814805Z" } }, "outputs": [ { "data": { "text/plain": [ "{'раз': 9,\n", " 'два': 3,\n", " 'три': 11,\n", " 'четыре': 12,\n", " 'вышел': 2,\n", " 'зайчик': 4,\n", " 'погулять': 6,\n", " 'вдруг': 0,\n", " 'охотник': 5,\n", " 'выбегает': 1,\n", " 'прямо': 7,\n", " 'стреляет': 10,\n", " 'пуляет': 8}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By https://stackoverflow.com/questions/37593293/what-is-the-simplest-way-to-get-tfidf-with-pandas-dataframe#comment72191707_37593408\n", "# v.get_feature_names() will give you the list of feature names.\n", "# v.vocabulary_ will give you a dict with feature names as keys and their index in the matrix produced as values.\n", "tfidf.vocabulary_" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.837422Z", "start_time": "2020-09-30T23:15:49.824270Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "### doc 0: ###\n", "четыре 0.5\n", "три 0.5\n", "два 0.5\n", "раз 0.5\n", "### doc 1: ###\n", "погулять 0.6390704413963749\n", "зайчик 0.42799292268317357\n", "вышел 0.6390704413963749\n", "### doc 2: ###\n", "выбегает 0.5773502691896258\n", "охотник 0.5773502691896258\n", "вдруг 0.5773502691896258\n", "### doc 3: ###\n", "стреляет 0.6901592662889633\n", "прямо 0.5568161504458247\n", "зайчик 0.46220770413113277\n", "### doc 4: ###\n", "пуляет 0.6901592662889633\n", "прямо 0.5568161504458247\n", "зайчик 0.46220770413113277\n" ] } ], "source": [ "# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html\n", "# это дает вам список всех токенов или n-граммов или слов. Для первого документа в вашем корпусе, \n", "# Позволяет распечатать их:\n", "def print_word_ratings_by_document(doc=0):\n", " feature_names = tfidf.get_feature_names()\n", " feature_index = tfidf_matrix[doc,:].nonzero()[1]\n", " tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index]) \n", " for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:\n", " print (w, s)\n", "\n", "for i, _ in articles.iterrows():\n", " print(\"### doc {}: ###\".format(i-1))\n", " print_word_ratings_by_document(i-1)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.854958Z", "start_time": "2020-09-30T23:15:49.838858Z" } }, "outputs": [ { "data": { "text/plain": [ "['зайчик',\n", " 'прямо',\n", " 'стреляет',\n", " 'пуляет',\n", " 'погулять',\n", " 'вышел',\n", " 'охотник',\n", " 'выбегает',\n", " 'вдруг',\n", " 'четыре']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# https://ru.stackoverflow.com/questions/772859/tfidfvectorizer/773018#773018\n", "# Топ 10 самых популярных слов:\n", "N=10\n", "idx = np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N]\n", "top_10_words = np.array(tfidf.get_feature_names())[idx].tolist()\n", "top_10_words" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Разбор" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.866125Z", "start_time": "2020-09-30T23:15:49.856417Z" } }, "outputs": [ { "data": { "text/plain": [ "(5, 13)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_matrix.shape" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.876690Z", "start_time": "2020-09-30T23:15:49.867221Z" } }, "outputs": [ { "data": { "text/plain": [ "matrix([[0. , 0. , 0. , 0.5 , 0. ,\n", " 0. , 0. , 0. , 0. , 0.5 ,\n", " 0. , 0.5 , 0.5 ],\n", " [0. , 0. , 0.63907044, 0. , 0.42799292,\n", " 0. , 0.63907044, 0. , 0. , 0. ,\n", " 0. , 0. , 0. ],\n", " [0.57735027, 0.57735027, 0. , 0. , 0. ,\n", " 0.57735027, 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0.4622077 ,\n", " 0. , 0. , 0.55681615, 0. , 0. ,\n", " 0.69015927, 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0.4622077 ,\n", " 0. , 0. , 0.55681615, 0.69015927, 0. ,\n", " 0. , 0. , 0. ]])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_matrix.todense()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.885867Z", "start_time": "2020-09-30T23:15:49.878116Z" } }, "outputs": [ { "data": { "text/plain": [ "matrix([[0.57735027, 0.57735027, 0.63907044, 0.5 , 1.35240833,\n", " 0.57735027, 0.63907044, 1.1136323 , 0.69015927, 0.5 ,\n", " 0.69015927, 0.5 , 0.5 ]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.sum.html#numpy.sum\n", "tfidf_matrix.sum(axis=0) # Сумма по столбцам" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.896395Z", "start_time": "2020-09-30T23:15:49.887149Z" } }, "outputs": [ { "data": { "text/plain": [ "matrix([[ 3, 9, 11, 12, 0, 1, 5, 2, 6, 8, 10, 7, 4]])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html\n", "tfidf_matrix.sum(axis=0).argsort(axis=1) # Возвращает *индексы*, по возрастанию значений элементов" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.910697Z", "start_time": "2020-09-30T23:15:49.898006Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([ 3, 9, 11, 12, 0, 1, 5, 2, 6, 8, 10, 7, 4])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.ravel.html\n", "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.923258Z", "start_time": "2020-09-30T23:15:49.911896Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12, 11, 9, 3])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1] # Reverse list (DESC)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.933466Z", "start_time": "2020-09-30T23:15:49.924779Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N] # Take top N elements" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.947141Z", "start_time": "2020-09-30T23:15:49.934612Z" } }, "outputs": [ { "data": { "text/plain": [ "array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[-N:][::-1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Поиск самого похожего документа" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.966716Z", "start_time": "2020-09-30T23:15:49.948679Z" }, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "array([0. , 0.19782163, 0. , 0.52368019])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "similarities = (tfidf_matrix * tfidf_matrix.T).A[-1,:-1]\n", "similarities" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:49.976618Z", "start_time": "2020-09-30T23:15:49.968456Z" } }, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html\n", "# numpy.argmax(a, axis=None, out=None)\n", "# Returns the indices of the maximum values along an axis.\n", "max_sim_position = np.argmax(similarities)\n", "max_sim_position" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.006256Z", "start_time": "2020-09-30T23:15:49.981559Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "0.52368018715548" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max_sim = max(similarities)\n", "max_sim" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.016294Z", "start_time": "2020-09-30T23:15:50.008281Z" } }, "outputs": [ { "data": { "text/plain": [ "0.52368018715548" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "similarities[max_sim_position]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Разбор" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.030833Z", "start_time": "2020-09-30T23:15:50.017817Z" }, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "matrix([[0. , 0. , 0. , 0.5 , 0. ,\n", " 0. , 0. , 0. , 0. , 0.5 ,\n", " 0. , 0.5 , 0.5 ],\n", " [0. , 0. , 0.63907044, 0. , 0.42799292,\n", " 0. , 0.63907044, 0. , 0. , 0. ,\n", " 0. , 0. , 0. ],\n", " [0.57735027, 0.57735027, 0. , 0. , 0. ,\n", " 0.57735027, 0. , 0. , 0. , 0. ,\n", " 0. , 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0.4622077 ,\n", " 0. , 0. , 0.55681615, 0. , 0. ,\n", " 0.69015927, 0. , 0. ],\n", " [0. , 0. , 0. , 0. , 0.4622077 ,\n", " 0. , 0. , 0.55681615, 0.69015927, 0. ,\n", " 0. , 0. , 0. ]])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_matrix.todense()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.045030Z", "start_time": "2020-09-30T23:15:50.031994Z" } }, "outputs": [ { "data": { "text/plain": [ "(5, 13)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_matrix.shape" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.053877Z", "start_time": "2020-09-30T23:15:50.046328Z" } }, "outputs": [ { "data": { "text/plain": [ "scipy.sparse.csr.csr_matrix" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(tfidf_matrix)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:17:03.579717Z", "start_time": "2020-09-30T23:17:03.569840Z" } }, "outputs": [ { "data": { "text/plain": [ "matrix([[1. , 0. , 0. , 0. , 0. ],\n", " [0. , 1. , 0. , 0.19782163, 0.19782163],\n", " [0. , 0. , 1. , 0. , 0. ],\n", " [0. , 0.19782163, 0. , 1. , 0.52368019],\n", " [0. , 0.19782163, 0. , 0.52368019, 1. ]])" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mul = tfidf_matrix * tfidf_matrix.T\n", "mul.todense()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:17:20.830796Z", "start_time": "2020-09-30T23:17:20.821745Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "(5, 5)" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mul.todense().shape" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:17:24.208235Z", "start_time": "2020-09-30T23:17:24.199391Z" } }, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0. , 0. , 0. , 0. ],\n", " [0. , 1. , 0. , 0.19782163, 0.19782163],\n", " [0. , 0. , 1. , 0. , 0. ],\n", " [0. , 0.19782163, 0. , 1. , 0.52368019],\n", " [0. , 0.19782163, 0. , 0.52368019, 1. ]])" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores = mul.A\n", "scores" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.103417Z", "start_time": "2020-09-30T23:15:50.094348Z" } }, "outputs": [ { "data": { "text/plain": [ "array([[-1. , 0. , 0. , 0. , 0. ],\n", " [ 0. , -1. , 0. , 0.19782163, 0.19782163],\n", " [ 0. , 0. , -1. , 0. , 0. ],\n", " [ 0. , 0.19782163, 0. , -1. , 0.52368019],\n", " [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We will search maximum, so do not willing match to himself:\n", "np.fill_diagonal(scores, -1)\n", "scores" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.116041Z", "start_time": "2020-09-30T23:15:50.104622Z" } }, "outputs": [ { "data": { "text/plain": [ "numpy.ndarray" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(scores)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.130614Z", "start_time": "2020-09-30T23:15:50.117201Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " </tr>\n", " <tr>\n", " <th>№</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>-1.0</td>\n", " <td>0.000000</td>\n", " <td>0.0</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.0</td>\n", " <td>-1.000000</td>\n", " <td>0.0</td>\n", " <td>0.197822</td>\n", " <td>0.197822</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.0</td>\n", " <td>0.000000</td>\n", " <td>-1.0</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.0</td>\n", " <td>0.197822</td>\n", " <td>0.0</td>\n", " <td>-1.000000</td>\n", " <td>0.523680</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0.0</td>\n", " <td>0.197822</td>\n", " <td>0.0</td>\n", " <td>0.523680</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4\n", "№ \n", "1 -1.0 0.000000 0.0 0.000000 0.000000\n", "2 0.0 -1.000000 0.0 0.197822 0.197822\n", "3 0.0 0.000000 -1.0 0.000000 0.000000\n", "4 0.0 0.197822 0.0 -1.000000 0.523680\n", "5 0.0 0.197822 0.0 0.523680 -1.000000" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores_df = pd.DataFrame(scores, index=articles.index)\n", "scores_df" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:18:10.999453Z", "start_time": "2020-09-30T23:18:10.986455Z" } }, "outputs": [ { "data": { "text/plain": [ "matrix([[1. , 0. , 0. , 0. , 0. ],\n", " [0. , 1. , 0. , 0.19782163, 0.19782163],\n", " [0. , 0. , 1. , 0. , 0. ],\n", " [0. , 0.19782163, 0. , 1. , 0.52368019],\n", " [0. , 0.19782163, 0. , 0.52368019, 1. ]])" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By https://gist.github.com/RZachLamberty/1ed47cd0e2d0d968f7cdbd3d53a50f4c\n", "# you can calculate cosine similarity easily given this\n", "(tfidf_matrix @ tfidf_matrix.T).todense()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:17:57.242549Z", "start_time": "2020-09-30T23:17:57.233091Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0. , 0. , 0. , 0. ],\n", " [0. , 1. , 0. , 0.19782163, 0.19782163],\n", " [0. , 0. , 1. , 0. , 0. ],\n", " [0. , 0.19782163, 0. , 1. , 0.52368019],\n", " [0. , 0.19782163, 0. , 0.52368019, 1. ]])" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.A.html\n", "# Return self as an ndarray object.\n", "# Equivalent to np.asarray(self)\n", "mul.A" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:18:00.735140Z", "start_time": "2020-09-30T23:18:00.725172Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([0. , 0.19782163, 0. , 0.52368019])" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mul.A[-1,:-1] # Последняя строка" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.169619Z", "start_time": "2020-09-30T23:15:50.162056Z" } }, "outputs": [ { "data": { "text/plain": [ "array([0. , 0.19782163, 0. , 0.52368019, 1. ])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mul.A[-1,0:] # Последняя строка" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.179627Z", "start_time": "2020-09-30T23:15:50.171112Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([0. , 0.19782163, 0. , 0.52368019])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mul.A[-1,:-1] # Последняя строка без последнего элемента.\n", "# Матрица диагональная (остальные не имеют значения, зеркально повторяются), содержит веса всех со всеми.\n", "# Последний элемент, выкидывается потому что это матч самого к себе, если мы рассматриваем максимальный дубль\n", "# для последнего в наборе документа (так было в функции get_similarities из GOJI)\n", "\n", "# Получается что реально нужно выкидывать по индексу того, для которого ищется сравнение!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Обобщение варианта - без пересчёта матрицы каждый раз!" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.187729Z", "start_time": "2020-09-30T23:15:50.181109Z" } }, "outputs": [], "source": [ "# Получается что реально нужно выкидывать по индексу того, для которого ищется сравнение!\n", "forDocNo=1 # Expect match doc 3<>4" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.198409Z", "start_time": "2020-09-30T23:15:50.188883Z" } }, "outputs": [ { "data": { "text/plain": [ "array([[-1. , 0. , 0. , 0. , 0. ],\n", " [ 0. , -1. , 0. , 0.19782163, 0.19782163],\n", " [ 0. , 0. , -1. , 0. , 0. ],\n", " [ 0. , 0.19782163, 0. , -1. , 0.52368019],\n", " [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.207749Z", "start_time": "2020-09-30T23:15:50.199654Z" } }, "outputs": [ { "data": { "text/plain": [ "(3, 0.19782162617776308)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = scores[forDocNo]\n", "max_sim_position = np.argmax(s)\n", "\n", "(max_sim_position, s[max_sim_position])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Для каждого документа, в DataFrame add \"Max TF/IDF DUP score\" and \"Max DUP score docId\"" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.222234Z", "start_time": "2020-09-30T23:15:50.208989Z" }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Article</th>\n", " </tr>\n", " <tr>\n", " <th>№</th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>Раз, два, три, четыре</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>вышел зайчик погулять</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>вдруг охотник выбегает</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>прямо в зайчик стреляет</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>пуляет прямо в зайчик</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Article\n", "№ \n", "1 Раз, два, три, четыре\n", "2 вышел зайчик погулять\n", "3 вдруг охотник выбегает\n", "4 прямо в зайчик стреляет\n", "5 пуляет прямо в зайчик" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "articles\n", "\n", "# Should be matched:\n", "# id № Dup№\n", "# 0 1 -\n", "# 1 2 (4 and 5)\n", "# 2 3 -\n", "# 3 4 5\n", "# 4 5 4" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.234915Z", "start_time": "2020-09-30T23:15:50.223347Z" }, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "array([[-1. , 0. , 0. , 0. , 0. ],\n", " [ 0. , -1. , 0. , 0.19782163, 0.19782163],\n", " [ 0. , 0. , -1. , 0. , 0. ],\n", " [ 0. , 0.19782163, 0. , -1. , 0.52368019],\n", " [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.253892Z", "start_time": "2020-09-30T23:15:50.236266Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " </tr>\n", " <tr>\n", " <th>№</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>-1.0</td>\n", " <td>0.000000</td>\n", " <td>0.0</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.0</td>\n", " <td>-1.000000</td>\n", " <td>0.0</td>\n", " <td>0.197822</td>\n", " <td>0.197822</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0.0</td>\n", " <td>0.000000</td>\n", " <td>-1.0</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0.0</td>\n", " <td>0.197822</td>\n", " <td>0.0</td>\n", " <td>-1.000000</td>\n", " <td>0.523680</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0.0</td>\n", " <td>0.197822</td>\n", " <td>0.0</td>\n", " <td>0.523680</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4\n", "№ \n", "1 -1.0 0.000000 0.0 0.000000 0.000000\n", "2 0.0 -1.000000 0.0 0.197822 0.197822\n", "3 0.0 0.000000 -1.0 0.000000 0.000000\n", "4 0.0 0.197822 0.0 -1.000000 0.523680\n", "5 0.0 0.197822 0.0 0.523680 -1.000000" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores_df" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.264727Z", "start_time": "2020-09-30T23:15:50.255609Z" } }, "outputs": [ { "data": { "text/plain": [ "0 0.0\n", "1 0.0\n", "2 -1.0\n", "3 0.0\n", "4 0.0\n", "Name: 3, dtype: float64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores_df.loc[3]" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.275188Z", "start_time": "2020-09-30T23:15:50.266014Z" } }, "outputs": [ { "data": { "text/plain": [ "0 0.000000\n", "1 0.197822\n", "2 0.000000\n", "3 -1.000000\n", "4 0.523680\n", "Name: 4, dtype: float64" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores_df.iloc[3]" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.299798Z", "start_time": "2020-09-30T23:15:50.276791Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Article</th>\n", " <th>Max DUP score docId</th>\n", " <th>Max TF/IDF DUP score</th>\n", " </tr>\n", " <tr>\n", " <th>№</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>Раз, два, три, четыре</td>\n", " <td>-1</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>вышел зайчик погулять</td>\n", " <td>4</td>\n", " <td>0.197822</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>вдруг охотник выбегает</td>\n", " <td>-1</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>прямо в зайчик стреляет</td>\n", " <td>5</td>\n", " <td>0.523680</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>пуляет прямо в зайчик</td>\n", " <td>4</td>\n", " <td>0.523680</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Article Max DUP score docId Max TF/IDF DUP score\n", "№ \n", "1 Раз, два, три, четыре -1 0.000000\n", "2 вышел зайчик погулять 4 0.197822\n", "3 вдруг охотник выбегает -1 0.000000\n", "4 прямо в зайчик стреляет 5 0.523680\n", "5 пуляет прямо в зайчик 4 0.523680" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By https://stackoverflow.com/questions/26658240/getting-the-index-of-a-row-in-a-pandas-apply-function/48819898#48819898\n", "# index available as row.name\n", "def most_similar(row):\n", "# print('====')\n", "# print('row.name={}; scores_df.loc[row.name].idxmax()={}; scores_df.iloc[scores_df.loc[row.name].idxmax()].name={}'.format(\n", "# row.name\n", "# ,scores_df.loc[row.name].idxmax()\n", "# ,scores_df.iloc[scores_df.loc[row.name].idxmax()].name\n", "# )\n", "# )\n", "# print('row.loc={}; row.iloc={}'.format(row.loc, row.iloc))\n", " max_similar_doc = scores_df.iloc[scores_df.loc[row.name].idxmax()].name # Array index (.iloc) into DataFrame index (.name)\n", " max_similar_score = scores_df.loc[row.name].max()\n", " return ((max_similar_doc if max_similar_score > 0 else -1), max_similar_score)\n", "\n", "# articles['Max DUP score docId'] = articles.apply(lambda i: np.argmax(scores), axis=1)\n", "# articles['Max DUP score docId'] = articles.apply(lambda i: type(i.index), axis=1)\n", "articles[['Max DUP score docId', 'Max TF/IDF DUP score']] = articles.apply(most_similar, axis=1, result_type='expand')\n", "articles['Max DUP score docId'] = articles['Max DUP score docId'].astype('int32')\n", "articles" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.308302Z", "start_time": "2020-09-30T23:15:50.301149Z" } }, "outputs": [ { "data": { "text/plain": [ "(№\n", " 1 0.000000\n", " 2 0.197822\n", " 3 0.000000\n", " 4 -1.000000\n", " 5 0.523680\n", " Name: 3, dtype: float64,\n", " 0 0.0\n", " 1 0.0\n", " 2 -1.0\n", " 3 0.0\n", " 4 0.0\n", " Name: 3, dtype: float64)" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "( scores_df[3], scores_df.loc[3] )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Python constructions\n", "## Apply with 2 columns at once\n", "\n", "By https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#52363890\n", "+By https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#comment106834440_16242202 for column naming" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.319263Z", "start_time": "2020-09-30T23:15:50.309733Z" } }, "outputs": [], "source": [ "# articles[['a', 'b']] = articles.apply(lambda i: [1, 2], axis=1, result_type='expand')\n", "# articles" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "ExecuteTime": { "end_time": "2020-09-30T23:15:50.443806Z", "start_time": "2020-09-30T23:15:50.320624Z" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.series.Series'>\n", "<class 'pandas.core.series.Series'>\n" ] }, { "ename": "KeyError", "evalue": "('File name', 'occurred at index 1')", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 4735\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4736\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mlibindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value_box\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4737\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.get_value_box\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.get_value_at\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/util.pxd\u001b[0m in \u001b[0;36mpandas._libs.util.get_value_at\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/util.pxd\u001b[0m in \u001b[0;36mpandas._libs.util.validate_indexer\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: 'str' object cannot be interpreted as an integer", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-51-38f66aea0c91>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Article'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0marticles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdump_xml_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)\u001b[0m\n\u001b[1;32m 6926\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6927\u001b[0m )\n\u001b[0;32m-> 6928\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6929\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6930\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_raw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 186\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_empty_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0;31m# compute the result using the series generator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 292\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_series_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 293\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;31m# wrap results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_series_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 322\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m<ipython-input-51-38f66aea0c91>\u001b[0m in \u001b[0;36mdump_xml_file\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdump_xml_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'File name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'articles/'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'File name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.xml'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w+'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Article'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1069\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1070\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1071\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1072\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1073\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 4742\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidIndexError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4743\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4744\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4745\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pragma: no cover\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4746\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 4728\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_convert_scalar_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"getitem\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4729\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4730\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtz\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"tz\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4731\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4732\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mholds_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_boolean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: ('File name', 'occurred at index 1')" ] } ], "source": [ "def dump_xml_file(row):\n", " print(type(row))\n", " print(row['File name'])\n", " with open('articles/' + row['File name'].replace('.xml', '.txt'), 'w+') as file:\n", " file.write(row['Article'])\n", "\n", "articles.apply(dump_xml_file, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Docs and links\n", "\n", "* [sklearn: TFIDF Transformer: Как получить значения tf-idf данных слов в документе](https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html)\n", "* [SO question: Имеется текст, надо вычислить TF-IDF-признаки по имеющимся тексту. Нашел 10 минимальных весов. Требуется найти 10 слов соответствующих абсолютному значению весов. Как можно это сделать?введите сюда описание изображения](https://ru.stackoverflow.com/questions/772859/tfidfvectorizer/773018#773018). С разбором что к чему." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "293.217px" }, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 2 }