{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preparation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.180691Z",
"start_time": "2020-09-30T23:15:48.111199Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in /usr/lib64/python3.8/site-packages (0.25.3)\n",
"Requirement already satisfied: numpy in /usr/lib64/python3.8/site-packages (1.18.4)\n",
"Requirement already satisfied: sklearn in /home/pasha/.local/lib/python3.8/site-packages (0.0)\n",
"Requirement already satisfied: nltk in /home/pasha/.local/lib/python3.8/site-packages (3.4.5)\n",
"Requirement already satisfied: python-dateutil>=2.6.1 in /usr/lib/python3.8/site-packages (from pandas) (2.8.0)\n",
"Requirement already satisfied: pytz>=2017.2 in /usr/lib/python3.8/site-packages (from pandas) (2020.1)\n",
"Requirement already satisfied: scikit-learn in /home/pasha/.local/lib/python3.8/site-packages (from sklearn) (0.22.2.post1)\n",
"Requirement already satisfied: six in /usr/lib/python3.8/site-packages (from nltk) (1.14.0)\n",
"Requirement already satisfied: joblib>=0.11 in /home/pasha/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (0.14.1)\n",
"Requirement already satisfied: scipy>=0.17.0 in /usr/lib64/python3.8/site-packages (from scikit-learn->sklearn) (1.4.1)\n"
]
}
],
"source": [
"import sys\n",
"# Install into the interpreter that runs this kernel.\n",
"# The PyPI distribution is named scikit-learn; the `sklearn` name on PyPI is a deprecated dummy package.\n",
"!{sys.executable} -m pip install --user pandas numpy scikit-learn nltk"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.728298Z",
"start_time": "2020-09-30T23:15:49.185540Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.732519Z",
"start_time": "2020-09-30T23:15:49.730219Z"
},
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"# Define constants\n",
"tfidf = TfidfVectorizer(stop_words='english')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.747272Z",
"start_time": "2020-09-30T23:15:49.734333Z"
}
},
"outputs": [],
"source": [
"# By https://ru.stackoverflow.com/questions/995616/Как-сделать-tf-idf-для-русских-текстов\n",
"\n",
"#import nltk\n",
"#from nltk.corpus import stopwords as nltk_stopwords\n",
"\n",
"#nltk.download('stopwords')\n",
"#stopwords = set(nltk_stopwords.words('russian') )\n",
"#tfidf = TfidfVectorizer(stop_words=stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.766942Z",
"start_time": "2020-09-30T23:15:49.748416Z"
},
"pycharm": {
"is_executing": true
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Article | \n",
"
\n",
" \n",
" | № | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" Раз, два, три, четыре | \n",
"
\n",
" \n",
" | 2 | \n",
" вышел зайчик погулять | \n",
"
\n",
" \n",
" | 3 | \n",
" вдруг охотник выбегает | \n",
"
\n",
" \n",
" | 4 | \n",
" прямо в зайчик стреляет | \n",
"
\n",
" \n",
" | 5 | \n",
" пуляет прямо в зайчик | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Article\n",
"№ \n",
"1 Раз, два, три, четыре\n",
"2 вышел зайчик погулять\n",
"3 вдруг охотник выбегает\n",
"4 прямо в зайчик стреляет\n",
"5 пуляет прямо в зайчик"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Minimal example:\n",
"articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.Ru', index_col='№')\n",
"# articles = pd.read_csv('data/TF-IDF/TF-IDF-min.csv.En', index_col='№')\n",
"# Full real file:\n",
"# articles = pd.read_csv('RDC-135_articles_golden_set_mapping.csv', index_col='№')\n",
"articles"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.781595Z",
"start_time": "2020-09-30T23:15:49.768242Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0. , 0. , 0. , 0.5 , 0. ,\n",
" 0. , 0. , 0. , 0. , 0.5 ,\n",
" 0. , 0.5 , 0.5 ],\n",
" [0. , 0. , 0.63907044, 0. , 0.42799292,\n",
" 0. , 0.63907044, 0. , 0. , 0. ,\n",
" 0. , 0. , 0. ],\n",
" [0.57735027, 0.57735027, 0. , 0. , 0. ,\n",
" 0.57735027, 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , 0. , 0.4622077 ,\n",
" 0. , 0. , 0.55681615, 0. , 0. ,\n",
" 0.69015927, 0. , 0. ],\n",
" [0. , 0. , 0. , 0. , 0.4622077 ,\n",
" 0. , 0. , 0.55681615, 0.69015927, 0. ,\n",
" 0. , 0. , 0. ]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf_matrix = tfidf.fit_transform(articles['Article'])\n",
"tfidf_matrix.toarray() # or res.todense()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.790933Z",
"start_time": "2020-09-30T23:15:49.782853Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(5, 13)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf_matrix.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.804071Z",
"start_time": "2020-09-30T23:15:49.793459Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"'english'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf.stop_words"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Вычисление\n",
"\n",
"## Самые популярные слова"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.813496Z",
"start_time": "2020-09-30T23:15:49.805692Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['вдруг',\n",
" 'выбегает',\n",
" 'вышел',\n",
" 'два',\n",
" 'зайчик',\n",
" 'охотник',\n",
" 'погулять',\n",
" 'прямо',\n",
" 'пуляет',\n",
" 'раз',\n",
" 'стреляет',\n",
" 'три',\n",
" 'четыре']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html\n",
"# tfidf_matrix holds the TF-IDF values of every document in the corpus; it is a large sparse matrix.\n",
"# The vectorizer's feature names are the list of all tokens (words / n-grams), ordered by matrix column.\n",
"# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out() when it exists.\n",
"np.asarray(tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()).tolist()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.823154Z",
"start_time": "2020-09-30T23:15:49.814805Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'раз': 9,\n",
" 'два': 3,\n",
" 'три': 11,\n",
" 'четыре': 12,\n",
" 'вышел': 2,\n",
" 'зайчик': 4,\n",
" 'погулять': 6,\n",
" 'вдруг': 0,\n",
" 'охотник': 5,\n",
" 'выбегает': 1,\n",
" 'прямо': 7,\n",
" 'стреляет': 10,\n",
" 'пуляет': 8}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# By https://stackoverflow.com/questions/37593293/what-is-the-simplest-way-to-get-tfidf-with-pandas-dataframe#comment72191707_37593408\n",
"# v.get_feature_names() will give you the list of feature names.\n",
"# v.vocabulary_ will give you a dict with feature names as keys and their index in the matrix produced as values.\n",
"tfidf.vocabulary_"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.837422Z",
"start_time": "2020-09-30T23:15:49.824270Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"### doc 0: ###\n",
"четыре 0.5\n",
"три 0.5\n",
"два 0.5\n",
"раз 0.5\n",
"### doc 1: ###\n",
"погулять 0.6390704413963749\n",
"зайчик 0.42799292268317357\n",
"вышел 0.6390704413963749\n",
"### doc 2: ###\n",
"выбегает 0.5773502691896258\n",
"охотник 0.5773502691896258\n",
"вдруг 0.5773502691896258\n",
"### doc 3: ###\n",
"стреляет 0.6901592662889633\n",
"прямо 0.5568161504458247\n",
"зайчик 0.46220770413113277\n",
"### doc 4: ###\n",
"пуляет 0.6901592662889633\n",
"прямо 0.5568161504458247\n",
"зайчик 0.46220770413113277\n"
]
}
],
"source": [
"# By https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html\n",
"# The vectorizer's feature names list every token (word / n-gram), ordered by matrix column.\n",
"# The helper below prints the tokens of one document together with their weights.\n",
"def print_word_ratings_by_document(doc=0):\n",
"    \"\"\"Print each term that has a non-zero TF-IDF weight in one document, with that weight.\n",
"\n",
"    doc: 0-based row position of the document in the module-level tfidf_matrix.\n",
"    Uses the fitted module-level tfidf vectorizer to map matrix columns to terms.\n",
"    \"\"\"\n",
"    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old releases.\n",
"    if hasattr(tfidf, 'get_feature_names_out'):\n",
"        feature_names = tfidf.get_feature_names_out()\n",
"    else:\n",
"        feature_names = tfidf.get_feature_names()\n",
"    # nonzero() returns (row_indices, column_indices); only the columns matter for a single row.\n",
"    feature_index = tfidf_matrix[doc, :].nonzero()[1]\n",
"    for col in feature_index:\n",
"        print(feature_names[col], tfidf_matrix[doc, col])\n",
"\n",
"# Iterate by row position: tfidf_matrix rows are positional and do not depend on the labels in articles.index.\n",
"for pos in range(tfidf_matrix.shape[0]):\n",
"    print(\"### doc {}: ###\".format(pos))\n",
"    print_word_ratings_by_document(pos)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.854958Z",
"start_time": "2020-09-30T23:15:49.838858Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['зайчик',\n",
" 'прямо',\n",
" 'стреляет',\n",
" 'пуляет',\n",
" 'погулять',\n",
" 'вышел',\n",
" 'охотник',\n",
" 'выбегает',\n",
" 'вдруг',\n",
" 'четыре']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# https://ru.stackoverflow.com/questions/772859/tfidfvectorizer/773018#773018\n",
"# Top N most popular words, ranked by TF-IDF weight summed over all documents:\n",
"N = 10\n",
"# Column sums -> ascending argsort -> flatten -> reverse to descending -> keep the first N column indices.\n",
"idx = np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N]\n",
"# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on old releases.\n",
"feature_names = tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()\n",
"top_10_words = np.asarray(feature_names)[idx].tolist()\n",
"top_10_words"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Разбор"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.866125Z",
"start_time": "2020-09-30T23:15:49.856417Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(5, 13)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf_matrix.shape"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.876690Z",
"start_time": "2020-09-30T23:15:49.867221Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0. , 0. , 0. , 0.5 , 0. ,\n",
" 0. , 0. , 0. , 0. , 0.5 ,\n",
" 0. , 0.5 , 0.5 ],\n",
" [0. , 0. , 0.63907044, 0. , 0.42799292,\n",
" 0. , 0.63907044, 0. , 0. , 0. ,\n",
" 0. , 0. , 0. ],\n",
" [0.57735027, 0.57735027, 0. , 0. , 0. ,\n",
" 0.57735027, 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , 0. , 0.4622077 ,\n",
" 0. , 0. , 0.55681615, 0. , 0. ,\n",
" 0.69015927, 0. , 0. ],\n",
" [0. , 0. , 0. , 0. , 0.4622077 ,\n",
" 0. , 0. , 0.55681615, 0.69015927, 0. ,\n",
" 0. , 0. , 0. ]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf_matrix.todense()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.885867Z",
"start_time": "2020-09-30T23:15:49.878116Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0.57735027, 0.57735027, 0.63907044, 0.5 , 1.35240833,\n",
" 0.57735027, 0.63907044, 1.1136323 , 0.69015927, 0.5 ,\n",
" 0.69015927, 0.5 , 0.5 ]])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.sum.html#numpy.sum\n",
"tfidf_matrix.sum(axis=0) # Сумма по столбцам"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.896395Z",
"start_time": "2020-09-30T23:15:49.887149Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[ 3, 9, 11, 12, 0, 1, 5, 2, 6, 8, 10, 7, 4]])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html\n",
"tfidf_matrix.sum(axis=0).argsort(axis=1) # Возвращает *индексы*, по возрастанию значений элементов"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.910697Z",
"start_time": "2020-09-30T23:15:49.898006Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 3, 9, 11, 12, 0, 1, 5, 2, 6, 8, 10, 7, 4])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.ravel.html\n",
"np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.923258Z",
"start_time": "2020-09-30T23:15:49.911896Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12, 11, 9, 3])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1] # Reverse list (DESC)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.933466Z",
"start_time": "2020-09-30T23:15:49.924779Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[::-1][:N] # Take top N elements"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.947141Z",
"start_time": "2020-09-30T23:15:49.934612Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4, 7, 10, 8, 6, 2, 5, 1, 0, 12])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.ravel(tfidf_matrix.sum(axis=0).argsort(axis=1))[-N:][::-1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Поиск самого похожего документа"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.966716Z",
"start_time": "2020-09-30T23:15:49.948679Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0. , 0.19782163, 0. , 0.52368019])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"similarities = (tfidf_matrix * tfidf_matrix.T).A[-1,:-1]\n",
"similarities"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:49.976618Z",
"start_time": "2020-09-30T23:15:49.968456Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Doc: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html\n",
"# numpy.argmax(a, axis=None, out=None)\n",
"# Returns the indices of the maximum values along an axis.\n",
"max_sim_position = np.argmax(similarities)\n",
"max_sim_position"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.006256Z",
"start_time": "2020-09-30T23:15:49.981559Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0.52368018715548"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max_sim = max(similarities)\n",
"max_sim"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.016294Z",
"start_time": "2020-09-30T23:15:50.008281Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.52368018715548"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"similarities[max_sim_position]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Разбор"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.030833Z",
"start_time": "2020-09-30T23:15:50.017817Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0. , 0. , 0. , 0.5 , 0. ,\n",
" 0. , 0. , 0. , 0. , 0.5 ,\n",
" 0. , 0.5 , 0.5 ],\n",
" [0. , 0. , 0.63907044, 0. , 0.42799292,\n",
" 0. , 0.63907044, 0. , 0. , 0. ,\n",
" 0. , 0. , 0. ],\n",
" [0.57735027, 0.57735027, 0. , 0. , 0. ,\n",
" 0.57735027, 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , 0. , 0.4622077 ,\n",
" 0. , 0. , 0.55681615, 0. , 0. ,\n",
" 0.69015927, 0. , 0. ],\n",
" [0. , 0. , 0. , 0. , 0.4622077 ,\n",
" 0. , 0. , 0.55681615, 0.69015927, 0. ,\n",
" 0. , 0. , 0. ]])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf_matrix.todense()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.045030Z",
"start_time": "2020-09-30T23:15:50.031994Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(5, 13)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf_matrix.shape"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.053877Z",
"start_time": "2020-09-30T23:15:50.046328Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"scipy.sparse.csr.csr_matrix"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(tfidf_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:17:03.579717Z",
"start_time": "2020-09-30T23:17:03.569840Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[1. , 0. , 0. , 0. , 0. ],\n",
" [0. , 1. , 0. , 0.19782163, 0.19782163],\n",
" [0. , 0. , 1. , 0. , 0. ],\n",
" [0. , 0.19782163, 0. , 1. , 0.52368019],\n",
" [0. , 0.19782163, 0. , 0.52368019, 1. ]])"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mul = tfidf_matrix * tfidf_matrix.T\n",
"mul.todense()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:17:20.830796Z",
"start_time": "2020-09-30T23:17:20.821745Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(5, 5)"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mul.todense().shape"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:17:24.208235Z",
"start_time": "2020-09-30T23:17:24.199391Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1. , 0. , 0. , 0. , 0. ],\n",
" [0. , 1. , 0. , 0.19782163, 0.19782163],\n",
" [0. , 0. , 1. , 0. , 0. ],\n",
" [0. , 0.19782163, 0. , 1. , 0.52368019],\n",
" [0. , 0.19782163, 0. , 0.52368019, 1. ]])"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores = mul.A\n",
"scores"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.103417Z",
"start_time": "2020-09-30T23:15:50.094348Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-1. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , -1. , 0. , 0.19782163, 0.19782163],\n",
" [ 0. , 0. , -1. , 0. , 0. ],\n",
" [ 0. , 0.19782163, 0. , -1. , 0.52368019],\n",
" [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We look for the maximum below, so exclude each document's match with itself:\n",
"np.fill_diagonal(scores, -1)\n",
"scores"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.116041Z",
"start_time": "2020-09-30T23:15:50.104622Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"numpy.ndarray"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(scores)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.130614Z",
"start_time": "2020-09-30T23:15:50.117201Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" | № | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" -1.0 | \n",
" 0.000000 | \n",
" 0.0 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.0 | \n",
" -1.000000 | \n",
" 0.0 | \n",
" 0.197822 | \n",
" 0.197822 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.0 | \n",
" 0.000000 | \n",
" -1.0 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.0 | \n",
" 0.197822 | \n",
" 0.0 | \n",
" -1.000000 | \n",
" 0.523680 | \n",
"
\n",
" \n",
" | 5 | \n",
" 0.0 | \n",
" 0.197822 | \n",
" 0.0 | \n",
" 0.523680 | \n",
" -1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"№ \n",
"1 -1.0 0.000000 0.0 0.000000 0.000000\n",
"2 0.0 -1.000000 0.0 0.197822 0.197822\n",
"3 0.0 0.000000 -1.0 0.000000 0.000000\n",
"4 0.0 0.197822 0.0 -1.000000 0.523680\n",
"5 0.0 0.197822 0.0 0.523680 -1.000000"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores_df = pd.DataFrame(scores, index=articles.index)\n",
"scores_df"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:18:10.999453Z",
"start_time": "2020-09-30T23:18:10.986455Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[1. , 0. , 0. , 0. , 0. ],\n",
" [0. , 1. , 0. , 0.19782163, 0.19782163],\n",
" [0. , 0. , 1. , 0. , 0. ],\n",
" [0. , 0.19782163, 0. , 1. , 0.52368019],\n",
" [0. , 0.19782163, 0. , 0.52368019, 1. ]])"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# By https://gist.github.com/RZachLamberty/1ed47cd0e2d0d968f7cdbd3d53a50f4c\n",
"# you can calculate cosine similarity easily given this\n",
"(tfidf_matrix @ tfidf_matrix.T).todense()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:17:57.242549Z",
"start_time": "2020-09-30T23:17:57.233091Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1. , 0. , 0. , 0. , 0. ],\n",
" [0. , 1. , 0. , 0.19782163, 0.19782163],\n",
" [0. , 0. , 1. , 0. , 0. ],\n",
" [0. , 0.19782163, 0. , 1. , 0.52368019],\n",
" [0. , 0.19782163, 0. , 0.52368019, 1. ]])"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.toarray.html\n",
"# mul is a scipy sparse matrix (not a numpy.matrix), so .A is shorthand for mul.toarray():\n",
"# it returns the matrix as a dense numpy.ndarray.\n",
"mul.A"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:18:00.735140Z",
"start_time": "2020-09-30T23:18:00.725172Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([0. , 0.19782163, 0. , 0.52368019])"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mul.A[-1,:-1] # Последняя строка"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.169619Z",
"start_time": "2020-09-30T23:15:50.162056Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([0. , 0.19782163, 0. , 0.52368019, 1. ])"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mul.A[-1,0:] # Последняя строка"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.179627Z",
"start_time": "2020-09-30T23:15:50.171112Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([0. , 0.19782163, 0. , 0.52368019])"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mul.A[-1,:-1] # Последняя строка без последнего элемента.\n",
"# The matrix is symmetric (values are mirrored across the main diagonal) and holds the similarity of every document with every other one.\n",
"# The last element is dropped because it is the document's match with itself, when we look for the best duplicate\n",
"# of the last document in the set (this is how the get_similarities function from GOJI did it)\n",
"\n",
"# Получается что реально нужно выкидывать по индексу того, для которого ищется сравнение!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Обобщение варианта - без пересчёта матрицы каждый раз!"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.187729Z",
"start_time": "2020-09-30T23:15:50.181109Z"
}
},
"outputs": [],
"source": [
"# Получается что реально нужно выкидывать по индексу того, для которого ищется сравнение!\n",
"forDocNo=1 # Expect match doc 3<>4"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.198409Z",
"start_time": "2020-09-30T23:15:50.188883Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-1. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , -1. , 0. , 0.19782163, 0.19782163],\n",
" [ 0. , 0. , -1. , 0. , 0. ],\n",
" [ 0. , 0.19782163, 0. , -1. , 0.52368019],\n",
" [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.207749Z",
"start_time": "2020-09-30T23:15:50.199654Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(3, 0.19782162617776308)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = scores[forDocNo]\n",
"max_sim_position = np.argmax(s)\n",
"\n",
"(max_sim_position, s[max_sim_position])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Для каждого документа, в DataFrame add \"Max TF/IDF DUP score\" and \"Max DUP score docId\""
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.222234Z",
"start_time": "2020-09-30T23:15:50.208989Z"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Article | \n",
"
\n",
" \n",
" | № | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" Раз, два, три, четыре | \n",
"
\n",
" \n",
" | 2 | \n",
" вышел зайчик погулять | \n",
"
\n",
" \n",
" | 3 | \n",
" вдруг охотник выбегает | \n",
"
\n",
" \n",
" | 4 | \n",
" прямо в зайчик стреляет | \n",
"
\n",
" \n",
" | 5 | \n",
" пуляет прямо в зайчик | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Article\n",
"№ \n",
"1 Раз, два, три, четыре\n",
"2 вышел зайчик погулять\n",
"3 вдруг охотник выбегает\n",
"4 прямо в зайчик стреляет\n",
"5 пуляет прямо в зайчик"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"articles\n",
"\n",
"# Should be matched:\n",
"# id № Dup№\n",
"# 0 1 -\n",
"# 1 2 (4 and 5)\n",
"# 2 3 -\n",
"# 3 4 5\n",
"# 4 5 4"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.234915Z",
"start_time": "2020-09-30T23:15:50.223347Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[-1. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , -1. , 0. , 0.19782163, 0.19782163],\n",
" [ 0. , 0. , -1. , 0. , 0. ],\n",
" [ 0. , 0.19782163, 0. , -1. , 0.52368019],\n",
" [ 0. , 0.19782163, 0. , 0.52368019, -1. ]])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.253892Z",
"start_time": "2020-09-30T23:15:50.236266Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" | № | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" -1.0 | \n",
" 0.000000 | \n",
" 0.0 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.0 | \n",
" -1.000000 | \n",
" 0.0 | \n",
" 0.197822 | \n",
" 0.197822 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0.0 | \n",
" 0.000000 | \n",
" -1.0 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.0 | \n",
" 0.197822 | \n",
" 0.0 | \n",
" -1.000000 | \n",
" 0.523680 | \n",
"
\n",
" \n",
" | 5 | \n",
" 0.0 | \n",
" 0.197822 | \n",
" 0.0 | \n",
" 0.523680 | \n",
" -1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"№ \n",
"1 -1.0 0.000000 0.0 0.000000 0.000000\n",
"2 0.0 -1.000000 0.0 0.197822 0.197822\n",
"3 0.0 0.000000 -1.0 0.000000 0.000000\n",
"4 0.0 0.197822 0.0 -1.000000 0.523680\n",
"5 0.0 0.197822 0.0 0.523680 -1.000000"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores_df"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.264727Z",
"start_time": "2020-09-30T23:15:50.255609Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 0.0\n",
"1 0.0\n",
"2 -1.0\n",
"3 0.0\n",
"4 0.0\n",
"Name: 3, dtype: float64"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores_df.loc[3]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.275188Z",
"start_time": "2020-09-30T23:15:50.266014Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 0.000000\n",
"1 0.197822\n",
"2 0.000000\n",
"3 -1.000000\n",
"4 0.523680\n",
"Name: 4, dtype: float64"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores_df.iloc[3]"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.299798Z",
"start_time": "2020-09-30T23:15:50.276791Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Article | \n",
" Max DUP score docId | \n",
" Max TF/IDF DUP score | \n",
"
\n",
" \n",
" | № | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" Раз, два, три, четыре | \n",
" -1 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" вышел зайчик погулять | \n",
" 4 | \n",
" 0.197822 | \n",
"
\n",
" \n",
" | 3 | \n",
" вдруг охотник выбегает | \n",
" -1 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" | 4 | \n",
" прямо в зайчик стреляет | \n",
" 5 | \n",
" 0.523680 | \n",
"
\n",
" \n",
" | 5 | \n",
" пуляет прямо в зайчик | \n",
" 4 | \n",
" 0.523680 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Article Max DUP score docId Max TF/IDF DUP score\n",
"№ \n",
"1 Раз, два, три, четыре -1 0.000000\n",
"2 вышел зайчик погулять 4 0.197822\n",
"3 вдруг охотник выбегает -1 0.000000\n",
"4 прямо в зайчик стреляет 5 0.523680\n",
"5 пуляет прямо в зайчик 4 0.523680"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# By https://stackoverflow.com/questions/26658240/getting-the-index-of-a-row-in-a-pandas-apply-function/48819898#48819898\n",
"# index available as row.name\n",
"def most_similar(row):\n",
"    \"\"\"Return (doc_label, score) for the document most similar to `row`.\n",
"\n",
"    row: a row of a DataFrame that shares its index with the module-level scores_df;\n",
"        only row.name (the index label) is used.\n",
"    Returns a tuple of the index label of the best-scoring document and that score.\n",
"    The label is -1 when the best score is not positive, i.e. nothing similar was found.\n",
"    \"\"\"\n",
"    doc_scores = scores_df.loc[row.name]  # similarity of this document to every document\n",
"    max_similar_score = doc_scores.max()\n",
"    # idxmax() yields a column label of scores_df, which is a 0-based position here;\n",
"    # translate that position back into the DataFrame index label.\n",
"    max_similar_doc = scores_df.index[doc_scores.idxmax()]\n",
"    return ((max_similar_doc if max_similar_score > 0 else -1), max_similar_score)\n",
"\n",
"# articles['Max DUP score docId'] = articles.apply(lambda i: np.argmax(scores), axis=1)\n",
"# articles['Max DUP score docId'] = articles.apply(lambda i: type(i.index), axis=1)\n",
"articles[['Max DUP score docId', 'Max TF/IDF DUP score']] = articles.apply(most_similar, axis=1, result_type='expand')\n",
"articles['Max DUP score docId'] = articles['Max DUP score docId'].astype('int32')\n",
"articles"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.308302Z",
"start_time": "2020-09-30T23:15:50.301149Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"(№\n",
" 1 0.000000\n",
" 2 0.197822\n",
" 3 0.000000\n",
" 4 -1.000000\n",
" 5 0.523680\n",
" Name: 3, dtype: float64,\n",
" 0 0.0\n",
" 1 0.0\n",
" 2 -1.0\n",
" 3 0.0\n",
" 4 0.0\n",
" Name: 3, dtype: float64)"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"( scores_df[3], scores_df.loc[3] )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Python constructions\n",
"## Apply with 2 columns at once\n",
"\n",
"By https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#52363890\n",
"+By https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns/52363890#comment106834440_16242202 for column naming"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.319263Z",
"start_time": "2020-09-30T23:15:50.309733Z"
}
},
"outputs": [],
"source": [
"# articles[['a', 'b']] = articles.apply(lambda i: [1, 2], axis=1, result_type='expand')\n",
"# articles"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"ExecuteTime": {
"end_time": "2020-09-30T23:15:50.443806Z",
"start_time": "2020-09-30T23:15:50.320624Z"
},
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"ename": "KeyError",
"evalue": "('File name', 'occurred at index 1')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 4735\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4736\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mlibindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value_box\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4737\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.get_value_box\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.get_value_at\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/util.pxd\u001b[0m in \u001b[0;36mpandas._libs.util.get_value_at\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/util.pxd\u001b[0m in \u001b[0;36mpandas._libs.util.validate_indexer\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: 'str' object cannot be interpreted as an integer",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Article'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0marticles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdump_xml_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)\u001b[0m\n\u001b[1;32m 6926\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6927\u001b[0m )\n\u001b[0;32m-> 6928\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6929\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6930\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_raw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 186\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_empty_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 290\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0;31m# compute the result using the series generator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 292\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_series_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 293\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;31m# wrap results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_series_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 322\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m\u001b[0m in \u001b[0;36mdump_xml_file\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdump_xml_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'File name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'articles/'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'File name'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.xml'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w+'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Article'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1069\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1070\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1071\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1072\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1073\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 4742\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mInvalidIndexError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4743\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4744\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4745\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pragma: no cover\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4746\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib64/python3.8/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m 4728\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_convert_scalar_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"getitem\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4729\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4730\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtz\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"tz\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4731\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4732\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mholds_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_boolean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: ('File name', 'occurred at index 1')"
]
}
],
"source": [
"def dump_xml_file(row):\n",
" print(type(row))\n",
" print(row['File name'])\n",
" with open('articles/' + row['File name'].replace('.xml', '.txt'), 'w+') as file:\n",
" file.write(row['Article'])\n",
"\n",
"articles.apply(dump_xml_file, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Docs and links\n",
"\n",
"* [sklearn: TFIDF Transformer: Как получить значения tf-idf данных слов в документе](https://www.rupython.com/sklearn-tfidf-transformer-tf-idf-33655.html)\n",
"* [SO question: Имеется текст, надо вычислить TF-IDF-признаки по имеющимся тексту. Нашел 10 минимальных весов. Требуется найти 10 слов соответствующих абсолютному значению весов. Как можно это сделать?введите сюда описание изображения](https://ru.stackoverflow.com/questions/772859/tfidfvectorizer/773018#773018). С разбором что к чему."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "293.217px"
},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 2
}