{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.180691Z",
     "start_time": "2020-09-30T23:15:48.111199Z"
    },
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pandas in /usr/lib64/python3.8/site-packages (0.25.3)\n",
      "Requirement already satisfied: numpy in /usr/lib64/python3.8/site-packages (1.18.4)\n",
      "Requirement already satisfied: sklearn in /home/pasha/.local/lib/python3.8/site-packages (0.0)\n",
      "Requirement already satisfied: nltk in /home/pasha/.local/lib/python3.8/site-packages (3.4.5)\n",
      "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/lib/python3.8/site-packages (from pandas) (2.8.0)\n",
      "Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3.8/site-packages (from pandas) (2020.1)\n",
      "Requirement already satisfied: scikit-learn in /home/pasha/.local/lib/python3.8/site-packages (from sklearn) (0.22.2.post1)\n",
      "Requirement already satisfied: six in /usr/lib/python3.8/site-packages (from nltk) (1.14.0)\n",
      "Requirement already satisfied: joblib>=0.11 in /home/pasha/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (0.14.1)\n",
      "Requirement already satisfied: scipy>=0.17.0 in /usr/lib64/python3.8/site-packages (from scikit-learn->sklearn) (1.4.1)\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "!{sys.executable} -m pip install --user pandas numpy sklearn nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.728298Z",
     "start_time": "2020-09-30T23:15:49.185540Z"
    },
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.732519Z",
     "start_time": "2020-09-30T23:15:49.730219Z"
    },
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [],
   "source": [
    "# Define constants\n",
    "tfidf = TfidfVectorizer(stop_words='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.747272Z",
     "start_time": "2020-09-30T23:15:49.734333Z"
    }
   },
   "outputs": [],
   "source": [
    "# By https://ru.stackoverflow.com/questions/995616/Как-сделать-tf-idf-для-русских-текстов\n",
    "\n",
    "#import nltk\n",
    "#from nltk.corpus import stopwords as nltk_stopwords\n",
    "\n",
    "#nltk.download('stopwords')\n",
    "#stopwords = set(nltk_stopwords.words('russian') )\n",
    "#tfidf = TfidfVectorizer(stop_words=stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.766942Z",
     "start_time": "2020-09-30T23:15:49.748416Z"
    },
    "pycharm": {
     "is_executing": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Article</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>№</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Раз, два, три, четыре</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>вышел зайчик погулять</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>вдруг охотник выбегает</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>прямо в зайчик стреляет</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>пуляет прямо в зайчик</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Article\n",
       "№                         \n",
       "1    Раз, два, три, четыре\n",
       "2    вышел зайчик погулять\n",
       "3   вдруг охотник выбегает\n",
       "4  прямо в зайчик стреляет\n",
       "5    пуляет прямо в зайчик"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Minimal example:\n",
    "articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.Ru', index_col='№')\n",
    "# articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.En', index_col='№')\n",
    "# Full real file:\n",
    "# articles = pd.read_csv('RDC-135_articles_golden_set_mapping.csv', index_col='№')\n",
    "articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.781595Z",
     "start_time": "2020-09-30T23:15:49.768242Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.        , 0.        , 0.        , 0.5       , 0.        ,\n",
       "        0.        , 0.        , 0.        , 0.        , 0.5       ,\n",
       "        0.        , 0.5       , 0.5       ],\n",
       "       [0.        , 0.        , 0.63907044, 0.        , 0.42799292,\n",
       "        0.        , 0.63907044, 0.        , 0.        , 0.        ,\n",
       "        0.        , 0.        , 0.        ],\n",
       "       [0.57735027, 0.57735027, 0.        , 0.        , 0.        ,\n",
       "        0.57735027, 0.        , 0.        , 0.        , 0.        ,\n",
       "        0.        , 0.        , 0.        ],\n",
       "       [0.        , 0.        , 0.        , 0.        , 0.4622077 ,\n",
       "        0.        , 0.        , 0.55681615, 0.        , 0.        ,\n",
       "        0.69015927, 0.        , 0.        ],\n",
       "       [0.        , 0.        , 0.        , 0.        , 0.4622077 ,\n",
       "        0.        , 0.        , 0.55681615, 0.69015927, 0.        ,\n",
       "        0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_matrix = tfidf.fit_transform(articles['Article'])\n",
    "tfidf_matrix.toarray() # or res.todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.790933Z",
     "start_time": "2020-09-30T23:15:49.782853Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 13)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.804071Z",
     "start_time": "2020-09-30T23:15:49.793459Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'english'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf.stop_words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Вычисление\n",
    "\n",
    "## Самые популярные слова"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.813496Z",
     "start_time": "2020-09-30T23:15:49.805692Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['вдруг',\n",
       " 'выбегает',\n",
       " 'вышел',\n",
       " 'два',\n",
       " 'зайчик',\n",
       " 'охотник',\n",
       " 'погулять',\n",
       " 'прямо',\n",
       " 'пуляет',\n",
       " 'раз',\n",
       " 'стреляет',\n",
       " 'три',\n",
       " 'четыре']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html\n",
    "# Вышеупомянутый X имеет значения TF-IDF всех документов в корпусе. Это большая разреженная матрица.\n",
    "# Теперь,\n",
    "tfidf.get_feature_names()\n",
    "# это дает вам список всех токенов или n-граммов или слов. Для первого документа в вашем корпусе, "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.823154Z",
     "start_time": "2020-09-30T23:15:49.814805Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'раз': 9,\n",
       " 'два': 3,\n",
       " 'три': 11,\n",
       " 'четыре': 12,\n",
       " 'вышел': 2,\n",
       " 'зайчик': 4,\n",
       " 'погулять': 6,\n",
       " 'вдруг': 0,\n",
       " 'охотник': 5,\n",
       " 'выбегает': 1,\n",
       " 'прямо': 7,\n",
       " 'стреляет': 10,\n",
       " 'пуляет': 8}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# By https://stackoverflow.com/questions/37593293/what-is-the-simplest-way-to-get-tfidf-with-pandas-dataframe#comment72191707_37593408\n",
    "# v.get_feature_names() will give you the list of feature names.\n",
    "# v.vocabulary_ will give you a dict with feature names as keys and their index in the matrix produced as values.\n",
    "tfidf.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.837422Z",
     "start_time": "2020-09-30T23:15:49.824270Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "### doc 0: ###\n",
      "четыре 0.5\n",
      "три 0.5\n",
      "два 0.5\n",
      "раз 0.5\n",
      "### doc 1: ###\n",
      "погулять 0.6390704413963749\n",
      "зайчик 0.42799292268317357\n",
      "вышел 0.6390704413963749\n",
      "### doc 2: ###\n",
      "выбегает 0.5773502691896258\n",
      "охотник 0.5773502691896258\n",
      "вдруг 0.5773502691896258\n",
      "### doc 3: ###\n",
      "стреляет 0.6901592662889633\n",
      "прямо 0.5568161504458247\n",
      "зайчик 0.46220770413113277\n",
      "### doc 4: ###\n",
      "пуляет 0.6901592662889633\n",
      "прямо 0.5568161504458247\n",
      "зайчик 0.46220770413113277\n"
     ]
    }
   ],
   "source": [
    "# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html\n",
    "# это дает вам список всех токенов или n-граммов или слов. Для первого документа в вашем корпусе, \n",
    "# Позволяет распечатать их:\n",
    "def print_word_ratings_by_document(doc=0):\n",
    "    feature_names = tfidf.get_feature_names()\n",
    "    feature_index = tfidf_matrix[doc,:].nonzero()[1]\n",
    "    tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index]) \n",
    "    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:\n",
    "        print (w, s)\n",
    "\n",
    "for i, _ in articles.iterrows():\n",
    "    print(\"### doc {}: ###\".format(i-1))\n",
    "    print_word_ratings_by_document(i-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.854958Z",
     "start_time": "2020-09-30T23:15:49.838858Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['зайчик',\n",
       " 'прямо',\n",
       " 'стреляет',\n",
       " 'пуляет',\n",
       " 'погулять',\n",
       " 'вышел',\n",
       " 'охотник',\n",
       " 'выбегает',\n",
       " 'вдруг',\n",
       " 'четыре']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# https://ru.stackoverflow.com/questions/772859/tfidfvectorizer/773018#773018\n",
    "# Топ 10 самых популярных слов:\n",
    "N=10\n",
    "idx = np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N]\n",
    "top_10_words = np.array(tfidf.get_feature_names())[idx].tolist()\n",
    "top_10_words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Разбор"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.866125Z",
     "start_time": "2020-09-30T23:15:49.856417Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 13)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.876690Z",
     "start_time": "2020-09-30T23:15:49.867221Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[0.        , 0.        , 0.        , 0.5       , 0.        ,\n",
       "         0.        , 0.        , 0.        , 0.        , 0.5       ,\n",
       "         0.        , 0.5       , 0.5       ],\n",
       "        [0.        , 0.        , 0.63907044, 0.        , 0.42799292,\n",
       "         0.        , 0.63907044, 0.        , 0.        , 0.        ,\n",
       "         0.        , 0.        , 0.        ],\n",
       "        [0.57735027, 0.57735027, 0.        , 0.        , 0.        ,\n",
       "         0.57735027, 0.        , 0.        , 0.        , 0.        ,\n",
       "         0.        , 0.        , 0.        ],\n",
       "        [0.        , 0.        , 0.        , 0.        , 0.4622077 ,\n",
       "         0.        , 0.        , 0.55681615, 0.        , 0.        ,\n",
       "         0.69015927, 0.        , 0.        ],\n",
       "        [0.        , 0.        , 0.        , 0.        , 0.4622077 ,\n",
       "         0.        , 0.        , 0.55681615, 0.69015927, 0.        ,\n",
       "         0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_matrix.todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.885867Z",
     "start_time": "2020-09-30T23:15:49.878116Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[0.57735027, 0.57735027, 0.63907044, 0.5       , 1.35240833,\n",
       "         0.57735027, 0.63907044, 1.1136323 , 0.69015927, 0.5       ,\n",
       "         0.69015927, 0.5       , 0.5       ]])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.sum.html#numpy.sum\n",
    "tfidf_matrix.sum(axis=0)  # Сумма по столбцам"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.896395Z",
     "start_time": "2020-09-30T23:15:49.887149Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[ 3,  9, 11, 12,  0,  1,  5,  2,  6,  8, 10,  7,  4]])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html\n",
    "tfidf_matrix.sum(axis=0).argsort(axis=1)  # Возвращает *индексы*, по возрастанию значений элементов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.910697Z",
     "start_time": "2020-09-30T23:15:49.898006Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3,  9, 11, 12,  0,  1,  5,  2,  6,  8, 10,  7,  4])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.ravel.html\n",
    "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.923258Z",
     "start_time": "2020-09-30T23:15:49.911896Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 4,  7, 10,  8,  6,  2,  5,  1,  0, 12, 11,  9,  3])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1]  # Reverse list (DESC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.933466Z",
     "start_time": "2020-09-30T23:15:49.924779Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 4,  7, 10,  8,  6,  2,  5,  1,  0, 12])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N] # Take top N elements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.947141Z",
     "start_time": "2020-09-30T23:15:49.934612Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 4,  7, 10,  8,  6,  2,  5,  1,  0, 12])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[-N:][::-1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Поиск самого похожего документа"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.966716Z",
     "start_time": "2020-09-30T23:15:49.948679Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.        , 0.19782163, 0.        , 0.52368019])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "similarities = (tfidf_matrix * tfidf_matrix.T).A[-1,:-1]\n",
    "similarities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:49.976618Z",
     "start_time": "2020-09-30T23:15:49.968456Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html\n",
    "#  numpy.argmax(a, axis=None, out=None)\n",
    "# Returns the indices of the maximum values along an axis.\n",
    "max_sim_position = np.argmax(similarities)\n",
    "max_sim_position"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.006256Z",
     "start_time": "2020-09-30T23:15:49.981559Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.52368018715548"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "max_sim = max(similarities)\n",
    "max_sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.016294Z",
     "start_time": "2020-09-30T23:15:50.008281Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.52368018715548"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "similarities[max_sim_position]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Разбор"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.030833Z",
     "start_time": "2020-09-30T23:15:50.017817Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[0.        , 0.        , 0.        , 0.5       , 0.        ,\n",
       "         0.        , 0.        , 0.        , 0.        , 0.5       ,\n",
       "         0.        , 0.5       , 0.5       ],\n",
       "        [0.        , 0.        , 0.63907044, 0.        , 0.42799292,\n",
       "         0.        , 0.63907044, 0.        , 0.        , 0.        ,\n",
       "         0.        , 0.        , 0.        ],\n",
       "        [0.57735027, 0.57735027, 0.        , 0.        , 0.        ,\n",
       "         0.57735027, 0.        , 0.        , 0.        , 0.        ,\n",
       "         0.        , 0.        , 0.        ],\n",
       "        [0.        , 0.        , 0.        , 0.        , 0.4622077 ,\n",
       "         0.        , 0.        , 0.55681615, 0.        , 0.        ,\n",
       "         0.69015927, 0.        , 0.        ],\n",
       "        [0.        , 0.        , 0.        , 0.        , 0.4622077 ,\n",
       "         0.        , 0.        , 0.55681615, 0.69015927, 0.        ,\n",
       "         0.        , 0.        , 0.        ]])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_matrix.todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.045030Z",
     "start_time": "2020-09-30T23:15:50.031994Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 13)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.053877Z",
     "start_time": "2020-09-30T23:15:50.046328Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "scipy.sparse.csr.csr_matrix"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(tfidf_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:17:03.579717Z",
     "start_time": "2020-09-30T23:17:03.569840Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[1.        , 0.        , 0.        , 0.        , 0.        ],\n",
       "        [0.        , 1.        , 0.        , 0.19782163, 0.19782163],\n",
       "        [0.        , 0.        , 1.        , 0.        , 0.        ],\n",
       "        [0.        , 0.19782163, 0.        , 1.        , 0.52368019],\n",
       "        [0.        , 0.19782163, 0.        , 0.52368019, 1.        ]])"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mul = tfidf_matrix * tfidf_matrix.T\n",
    "mul.todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:17:20.830796Z",
     "start_time": "2020-09-30T23:17:20.821745Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 5)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mul.todense().shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:17:24.208235Z",
     "start_time": "2020-09-30T23:17:24.199391Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.        , 0.        , 0.        , 0.        ],\n",
       "       [0.        , 1.        , 0.        , 0.19782163, 0.19782163],\n",
       "       [0.        , 0.        , 1.        , 0.        , 0.        ],\n",
       "       [0.        , 0.19782163, 0.        , 1.        , 0.52368019],\n",
       "       [0.        , 0.19782163, 0.        , 0.52368019, 1.        ]])"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores = mul.A\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.103417Z",
     "start_time": "2020-09-30T23:15:50.094348Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.        ,  0.        ,  0.        ,  0.        ,  0.        ],\n",
       "       [ 0.        , -1.        ,  0.        ,  0.19782163,  0.19782163],\n",
       "       [ 0.        ,  0.        , -1.        ,  0.        ,  0.        ],\n",
       "       [ 0.        ,  0.19782163,  0.        , -1.        ,  0.52368019],\n",
       "       [ 0.        ,  0.19782163,  0.        ,  0.52368019, -1.        ]])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We will search maximum, so do not willing match to himself:\n",
    "np.fill_diagonal(scores, -1)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.116041Z",
     "start_time": "2020-09-30T23:15:50.104622Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numpy.ndarray"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.130614Z",
     "start_time": "2020-09-30T23:15:50.117201Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>№</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.197822</td>\n",
       "      <td>0.197822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.197822</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.523680</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.197822</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.523680</td>\n",
       "      <td>-1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0         1    2         3         4\n",
       "№                                        \n",
       "1 -1.0  0.000000  0.0  0.000000  0.000000\n",
       "2  0.0 -1.000000  0.0  0.197822  0.197822\n",
       "3  0.0  0.000000 -1.0  0.000000  0.000000\n",
       "4  0.0  0.197822  0.0 -1.000000  0.523680\n",
       "5  0.0  0.197822  0.0  0.523680 -1.000000"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores_df = pd.DataFrame(scores, index=articles.index)\n",
    "scores_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:18:10.999453Z",
     "start_time": "2020-09-30T23:18:10.986455Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[1.        , 0.        , 0.        , 0.        , 0.        ],\n",
       "        [0.        , 1.        , 0.        , 0.19782163, 0.19782163],\n",
       "        [0.        , 0.        , 1.        , 0.        , 0.        ],\n",
       "        [0.        , 0.19782163, 0.        , 1.        , 0.52368019],\n",
       "        [0.        , 0.19782163, 0.        , 0.52368019, 1.        ]])"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# By https://gist.github.com/RZachLamberty/1ed47cd0e2d0d968f7cdbd3d53a50f4c\n",
    "# you can calculate cosine similarity easily given this\n",
    "(tfidf_matrix @ tfidf_matrix.T).todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:17:57.242549Z",
     "start_time": "2020-09-30T23:17:57.233091Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.        , 0.        , 0.        , 0.        ],\n",
       "       [0.        , 1.        , 0.        , 0.19782163, 0.19782163],\n",
       "       [0.        , 0.        , 1.        , 0.        , 0.        ],\n",
       "       [0.        , 0.19782163, 0.        , 1.        , 0.52368019],\n",
       "       [0.        , 0.19782163, 0.        , 0.52368019, 1.        ]])"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.A.html\n",
    "# Return self as an ndarray object.\n",
    "# Equivalent to np.asarray(self)\n",
    "mul.A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:18:00.735140Z",
     "start_time": "2020-09-30T23:18:00.725172Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.        , 0.19782163, 0.        , 0.52368019])"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mul.A[-1,:-1] # Последняя строка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.169619Z",
     "start_time": "2020-09-30T23:15:50.162056Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.        , 0.19782163, 0.        , 0.52368019, 1.        ])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mul.A[-1,0:] # Последняя строка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.179627Z",
     "start_time": "2020-09-30T23:15:50.171112Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.        , 0.19782163, 0.        , 0.52368019])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mul.A[-1,:-1] # Последняя строка без последнего элемента.\n",
    "# Матрица диагональная (остальные не имеют значения, зеркально повторяются), содержит веса всех со всеми.\n",
    "# Последний элемент, выкидывается потому что это матч самого к себе, если мы рассматриваем максимальный дубль\n",
    "# для последнего в наборе документа (так было в функции get_similarities из GOJI)\n",
    "\n",
    "# Получается что реально нужно выкидывать по индексу того, для которого ищется сравнение!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Обобщение варианта - без пересчёта матрицы каждый раз!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.187729Z",
     "start_time": "2020-09-30T23:15:50.181109Z"
    }
   },
   "outputs": [],
   "source": [
    "# Получается что реально нужно выкидывать по индексу того, для которого ищется сравнение!\n",
    "forDocNo=1 # Expect match doc 3<>4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.198409Z",
     "start_time": "2020-09-30T23:15:50.188883Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.        ,  0.        ,  0.        ,  0.        ,  0.        ],\n",
       "       [ 0.        , -1.        ,  0.        ,  0.19782163,  0.19782163],\n",
       "       [ 0.        ,  0.        , -1.        ,  0.        ,  0.        ],\n",
       "       [ 0.        ,  0.19782163,  0.        , -1.        ,  0.52368019],\n",
       "       [ 0.        ,  0.19782163,  0.        ,  0.52368019, -1.        ]])"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.207749Z",
     "start_time": "2020-09-30T23:15:50.199654Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 0.19782162617776308)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s = scores[forDocNo]\n",
    "max_sim_position = np.argmax(s)\n",
    "\n",
    "(max_sim_position, s[max_sim_position])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Для каждого документа, в DataFrame add \"Max TF/IDF DUP score\" and \"Max DUP score docId\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.222234Z",
     "start_time": "2020-09-30T23:15:50.208989Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Article</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>№</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Раз, два, три, четыре</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>вышел зайчик погулять</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>вдруг охотник выбегает</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>прямо в зайчик стреляет</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>пуляет прямо в зайчик</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Article\n",
       "№                         \n",
       "1    Раз, два, три, четыре\n",
       "2    вышел зайчик погулять\n",
       "3   вдруг охотник выбегает\n",
       "4  прямо в зайчик стреляет\n",
       "5    пуляет прямо в зайчик"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "articles\n",
    "\n",
    "# Should be matched:\n",
    "# id № Dup№\n",
    "#  0 1 -\n",
    "#  1 2 (4 and 5)\n",
    "#  2 3 -\n",
    "#  3 4 5\n",
    "#  4 5 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.234915Z",
     "start_time": "2020-09-30T23:15:50.223347Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.        ,  0.        ,  0.        ,  0.        ,  0.        ],\n",
       "       [ 0.        , -1.        ,  0.        ,  0.19782163,  0.19782163],\n",
       "       [ 0.        ,  0.        , -1.        ,  0.        ,  0.        ],\n",
       "       [ 0.        ,  0.19782163,  0.        , -1.        ,  0.52368019],\n",
       "       [ 0.        ,  0.19782163,  0.        ,  0.52368019, -1.        ]])"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.253892Z",
     "start_time": "2020-09-30T23:15:50.236266Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>№</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.197822</td>\n",
       "      <td>0.197822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.197822</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.523680</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.197822</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.523680</td>\n",
       "      <td>-1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0         1    2         3         4\n",
       "№                                        \n",
       "1 -1.0  0.000000  0.0  0.000000  0.000000\n",
       "2  0.0 -1.000000  0.0  0.197822  0.197822\n",
       "3  0.0  0.000000 -1.0  0.000000  0.000000\n",
       "4  0.0  0.197822  0.0 -1.000000  0.523680\n",
       "5  0.0  0.197822  0.0  0.523680 -1.000000"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.264727Z",
     "start_time": "2020-09-30T23:15:50.255609Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.0\n",
       "1    0.0\n",
       "2   -1.0\n",
       "3    0.0\n",
       "4    0.0\n",
       "Name: 3, dtype: float64"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores_df.loc[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.275188Z",
     "start_time": "2020-09-30T23:15:50.266014Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.000000\n",
       "1    0.197822\n",
       "2    0.000000\n",
       "3   -1.000000\n",
       "4    0.523680\n",
       "Name: 4, dtype: float64"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores_df.iloc[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.299798Z",
     "start_time": "2020-09-30T23:15:50.276791Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Article</th>\n",
       "      <th>Max DUP score docId</th>\n",
       "      <th>Max TF/IDF DUP score</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>№</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Раз, два, три, четыре</td>\n",
       "      <td>-1</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>вышел зайчик погулять</td>\n",
       "      <td>4</td>\n",
       "      <td>0.197822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>вдруг охотник выбегает</td>\n",
       "      <td>-1</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>прямо в зайчик стреляет</td>\n",
       "      <td>5</td>\n",
       "      <td>0.523680</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>пуляет прямо в зайчик</td>\n",
       "      <td>4</td>\n",
       "      <td>0.523680</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Article  Max DUP score docId  Max TF/IDF DUP score\n",
       "№                                                                    \n",
       "1    Раз, два, три, четыре                   -1              0.000000\n",
       "2    вышел зайчик погулять                    4              0.197822\n",
       "3   вдруг охотник выбегает                   -1              0.000000\n",
       "4  прямо в зайчик стреляет                    5              0.523680\n",
       "5    пуляет прямо в зайчик                    4              0.523680"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# By https://stackoverflow.com/questions/26658240/getting-the-index-of-a-row-in-a-pandas-apply-function/48819898#48819898\n",
    "# index available as row.name\n",
    "def most_similar(row):\n",
    "#     print('====')\n",
    "#     print('row.name={}; scores_df.loc[row.name].idxmax()={}; scores_df.iloc[scores_df.loc[row.name].idxmax()].name={}'.format(\n",
    "#         row.name\n",
    "#         ,scores_df.loc[row.name].idxmax()\n",
    "#         ,scores_df.iloc[scores_df.loc[row.name].idxmax()].name\n",
    "#         )\n",
    "#     )\n",
    "#     print('row.loc={}; row.iloc={}'.format(row.loc, row.iloc))\n",
    "    max_similar_doc = scores_df.iloc[scores_df.loc[row.name].idxmax()].name # Array index (.iloc) into DataFrame index (.name)\n",
    "    max_similar_score = scores_df.loc[row.name].max()\n",
    "    return ((max_similar_doc if max_similar_score > 0 else -1), max_similar_score)\n",
    "\n",
    "# articles['Max DUP score docId'] = articles.apply(lambda i: np.argmax(scores), axis=1)\n",
    "# articles['Max DUP score docId'] = articles.apply(lambda i: type(i.index), axis=1)\n",
    "articles[['Max DUP score docId', 'Max TF/IDF DUP score']] = articles.apply(most_similar, axis=1, result_type='expand')\n",
    "articles['Max DUP score docId'] = articles['Max DUP score docId'].astype('int32')\n",
    "articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.308302Z",
     "start_time": "2020-09-30T23:15:50.301149Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(№\n",
       " 1    0.000000\n",
       " 2    0.197822\n",
       " 3    0.000000\n",
       " 4   -1.000000\n",
       " 5    0.523680\n",
       " Name: 3, dtype: float64,\n",
       " 0    0.0\n",
       " 1    0.0\n",
       " 2   -1.0\n",
       " 3    0.0\n",
       " 4    0.0\n",
       " Name: 3, dtype: float64)"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "( scores_df[3], scores_df.loc[3] )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Python constructions\n",
    "## Apply with 2 columns at once\n",
    "\n",
    "By https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#52363890\n",
    "+By https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#comment106834440_16242202 for column naming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.319263Z",
     "start_time": "2020-09-30T23:15:50.309733Z"
    }
   },
   "outputs": [],
   "source": [
    "# articles[['a', 'b']] = articles.apply(lambda i: [1, 2], axis=1, result_type='expand')\n",
    "# articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-09-30T23:15:50.443806Z",
     "start_time": "2020-09-30T23:15:50.320624Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.series.Series'>\n",
      "<class 'pandas.core.series.Series'>\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "('File name', 'occurred at index 1')",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m   4735\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4736\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mlibindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value_box\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4737\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.get_value_box\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.get_value_at\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/util.pxd\u001b[0m in \u001b[0;36mpandas._libs.util.get_value_at\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/util.pxd\u001b[0m in \u001b[0;36mpandas._libs.util.validate_indexer\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mTypeError\u001b[0m: 'str' object cannot be interpreted as an integer",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-51-38f66aea0c91>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Article'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0marticles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdump_xml_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)\u001b[0m\n\u001b[1;32m   6926\u001b[0m             \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   6927\u001b[0m         )\n\u001b[0;32m-> 6928\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   6929\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   6930\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    184\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_raw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    185\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 186\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    188\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mapply_empty_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    290\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    291\u001b[0m         \u001b[0;31m# compute the result using the series generator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 292\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_series_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    293\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    294\u001b[0m         \u001b[0;31m# wrap results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_series_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    319\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    320\u001b[0m                 \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m                     \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    322\u001b[0m                     \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    323\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-51-38f66aea0c91>\u001b[0m in \u001b[0;36mdump_xml_file\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdump_xml_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'File name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      4\u001b[0m     \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'articles/'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'File name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.xml'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w+'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Article'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   1069\u001b[0m         \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1070\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1071\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1072\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1073\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m   4742\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mInvalidIndexError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4743\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4744\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4745\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m  \u001b[0;31m# pragma: no cover\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4746\u001b[0m                 \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m   4728\u001b[0m         \u001b[0mk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_convert_scalar_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"getitem\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4729\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4730\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtz\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"tz\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4731\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4732\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mholds_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_boolean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: ('File name', 'occurred at index 1')"
     ]
    }
   ],
   "source": [
    "def dump_xml_file(row):\n",
    "    print(type(row))\n",
    "    print(row['File name'])\n",
    "    with open('articles/' + row['File name'].replace('.xml', '.txt'), 'w+') as file:\n",
    "        file.write(row['Article'])\n",
    "\n",
    "articles.apply(dump_xml_file, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Docs and links\n",
    "\n",
    "* [sklearn: TFIDF Transformer: Как получить значения tf-idf данных слов в документе](https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html)\n",
    "* [SO question: Имеется текст, надо вычислить TF-IDF-признаки по имеющимся тексту. Нашел 10 минимальных весов. Требуется найти 10 слов соответствующих абсолютному значению весов. Как можно это сделать?введите сюда описание изображения](https://ru.stackoverflow.com/questions/772859/tfidfvectorizer/773018#773018). С разбором что к чему."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "293.217px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}