{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from operator import itemgetter\n", "from concurrent.futures import ProcessPoolExecutor \n", "\n", "import os \n", "import gensim\n", "import arxiv\n", "import pandas as pd\n", "\n", "import itertools\n", "\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "import scikitplot\n", "\n", "from nltk.stem import WordNetLemmatizer\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, LabelEncoder\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "from sklearn.multiclass import OneVsRestClassifier\n", "from sklearn.model_selection import train_test_split\n", "\n", "from fastFM import sgd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data preprocessing" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "lemmatizer = WordNetLemmatizer()\n", "\n", "\n", "def stem(text):\n", " return lemmatizer.lemmatize(text)\n", "\n", "\n", "def map_parallel(f, iterable, **kwargs):\n", " with ProcessPoolExecutor() as pool:\n", " result = pool.map(f, iterable, **kwargs)\n", " return result\n", "\n", "\n", "def retrieve_articles(start, chunksize=1000):\n", " return arxiv.query(\n", " search_query=search_query,\n", " start=start,\n", " max_results=chunksize\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Actual text mining functions" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def vectorize_text(examples_df):\n", "\n", " vectorizer = CountVectorizer(min_df=2)\n", " features = vectorizer.fit_transform(examples_df['summary'])\n", "\n", " le = LabelEncoder()\n", " ohe = OneHotEncoder()\n", " labels = le.fit_transform(valid_example_categories).reshape(-1, 1)\n", " labels_ohe = ohe.fit_transform(labels).todense()\n", " vectorized_data = {\n", " 'features': features,\n", " 'labels': labels,\n", " 'labels_onehot' : labels_ohe\n", " }\n", " return vectorized_data, (ohe, le)\n", "\n", "\n", "def extract_keywords(text):\n", " \"\"\"\n", " Use gensim's textrank-based approach\n", " \"\"\"\n", " return gensim.summarization.keywords(\n", " text=text,\n", " lemmatize=True,\n", " split=True\n", " )\n", "\n", "\n", "def extract_mz_keywords(text):\n", " \"\"\"\n", " Use gensim's Montemurro-Zanette method implementation\n", " \"\"\"\n", " return gensim.summarization.mz_keywords(\n", " text=stem(text),\n", " blocksize=32,\n", " split=True\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Factorization machine utils" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "class FMClassifier(sgd.FMClassification):\n", " \"\"\"\n", " Wrapper for fastFM estimator that makes it behave like sklearn ones\n", " \"\"\"\n", " \n", " def fit(self, X, y, *args):\n", " y = y.copy()\n", " y[y == 0] = -1\n", " return super(FMClassifier, self).fit(X, y, *args)\n", "\n", " def predict_proba(self, X):\n", " probs = super(FMClassifier, self).predict_proba(X)\n", " return np.tile(probs, 2).reshape(2, probs.shape[0]).T\n", " \n", "\n", "def predict_ovr(model, X):\n", " \"\"\"\n", " predict as multiclass (standard OVR behaves as predicting multilabel)\n", " \"\"\"\n", " return np.argmax(model.predict_proba(X), 1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def filter_out_small_categories(df, categories, threshold=200):\n", "\n", " class_counts = categories.value_counts()\n", " too_small_classes = class_counts[class_counts < threshold].index\n", " too_small_classes\n", "\n", " valid_example_indices = ~categories.isin(too_small_classes)\n", " valid_examples = df[valid_example_indices]\n", " valid_example_categories = categories[valid_example_indices]\n", " \n", " return valid_examples, valid_example_categories" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plotting utils" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def report_classification_confusion_matrix(y, y_pred, label_encoder):\n", "\n", " y_test_pred_label_names = label_encoder.inverse_transform(y_pred)\n", " y_test_label_names = label_encoder.inverse_transform(y.reshape(-1))\n", "\n", " print(classification_report(y_test_label_names, y_test_pred_label_names))\n", "\n", " \n", " scikitplot.metrics.plot_confusion_matrix(\n", " y_test_label_names,\n", " y_test_pred_label_names,\n", " hide_zeros=True,\n", " x_tick_rotation=90\n", " )\n", " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load ML articles from arxiv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "%%time\n", "\n", "search_query = 'matrix factorization'\n", "\n", "max_n_articles = 10000\n", "chunksize = 1000\n", "\n", "\n", "# we need to use def since lambdas can't be pickled\n", "def retrieve_chunk(chunk_start):\n", " return arxiv.query(\n", " search_query=search_query,\n", " start=chunk_start,\n", " max_results=chunksize\n", " )\n", "\n", "\n", "result_chunks = list(\n", " map_parallel(\n", " retrieve_chunk,\n", " range(0, max_n_articles, chunksize)\n", " )\n", ")\n", "\n", "results = list(itertools.chain(*result_chunks))\n", "\n", "print('Retrieved {} articles'.format(len(results)))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Retrieved 10000 articles\n" ] } ], "source": [ "%store -r results\n", "print('Retrieved {} articles'.format(len(results)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display some basic information" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "********************\n", "Approximate Method of Variational Bayesian Matrix\n", " Factorization/Completion with Sparse Prior\n", "********************\n", "Ryota Kawasumi, Koujin Takeda\n", "2018-03-14T13:54:23Z\n", "We derive analytical expression of matrix factorization/completion solution\n", "by variational Bayes method, under the assumption that observed matrix is\n", "originally the product of low-rank dense and sparse matrices with additive\n", "noise. We assume the prior of sparse matrix is Laplace distribution by taking\n", "matrix sparsity into consideration. Then we use several approximations for\n", "derivation of matrix factorization/completion solution. By our solution, we\n", "also numerically evaluate the performance of sparse matrix reconstruction in\n", "matrix factorization, and completion of missing matrix element in matrix\n", "completion.\n", "\n", "********************\n", "A New Method of Matrix Spectral Factorization\n", "********************\n", "Gigla Janashia, Edem Lagvilava, Lasha Ephremidze\n", "2009-09-29T15:08:13Z\n", "A new method of matrix spectral factorization is proposed which reliably\n", "computes an approximate spectral factor of any matrix spectral density that\n", "admits spectral factorization\n", "\n", "********************\n", "Matrix Factorizations via the Inverse Function Theorem\n", "********************\n", "Paul W. Y. Lee\n", "2014-08-12T03:29:00Z\n", "We give proofs of QR factorization, Cholesky's factorization, and LDU\n", "factorization using the inverse function theorem. As a consequence, we obtain\n", "analytic dependence of these matrix factorizations which does not follow\n", "immediately using Gaussian elimination.\n", "\n", "********************\n", "The Reciprocal Pascal Matrix\n", "********************\n", "Thomas M. Richardson\n", "2014-05-24T16:16:58Z\n", "The reciprocal Pascal matrix is the Hadamard inverse of the symmetric Pascal\n", "matrix. We show that the ordinary matrix inverse of the reciprocal Pascal\n", "matrix has integer elements. The proof uses two factorizations of the matrix of\n", "super Catalan numbers.\n", "\n", "********************\n", "Invariance properties of thematic factorizations of matrix functions\n", "********************\n", "R. B. Alexeev, V. V. Peller\n", "2001-01-26T21:54:16Z\n", "We study the problem of invariance of indices of thematic factorizations.\n", "Such factorizations were introduced in [PY1] for studying superoptimal\n", "approximation by bounded analytic matrix functions. As shown in [PY1], the\n", "indices may depend on the choice of a thematic factorization. We introduce the\n", "notion of a monotone thematic factorization. The main result shows that under\n", "natural assumptions a matrix function that admits a thematic factorization also\n", "admits a monotone thematic factorization and the indices of a monotone thematic\n", "factorization are uniquely determined by the matrix function itself. We obtain\n", "similar results for so-called partial thematic factorizations.\n", "\n", "********************\n", "Online Matrix Factorization via Broyden Updates\n", "********************\n", "Ömer Deniz Akyıldız\n", "2015-06-26T07:11:17Z\n", "In this paper, we propose an online algorithm to compute matrix\n", "factorizations. Proposed algorithm updates the dictionary matrix and associated\n", "coefficients using a single observation at each time. The algorithm performs\n", "low-rank updates to dictionary matrix. We derive the algorithm by defining a\n", "simple objective function to minimize whenever an observation is arrived. We\n", "extend the algorithm further for handling missing data. We also provide a\n", "mini-batch extension which enables to compute the matrix factorization on big\n", "datasets. We demonstrate the efficiency of our algorithm on a real dataset and\n", "give comparisons with well-known algorithms such as stochastic gradient matrix\n", "factorization and nonnegative matrix factorization (NMF).\n", "\n", "********************\n", "Matrix factorizations and intertwiners of the fundamental\n", " representations of quantum group U_q (sl_n)\n", "********************\n", "Yasuyoshi Yonezawa\n", "2008-06-30T17:13:00Z\n", "We want to construct a homological link invariant whose Euler characteristic\n", "is MOY polynomial as Khovanov and Rozansky constructed a categorification of\n", "HOMFLY polynomial. The present paper gives the first step to construct a\n", "categorification of MOY polynomial. For the essential colored planar diagrams\n", "with additional data which is a sequence naturally induced by coloring, we\n", "define matrix factorizations, and then we define a matrix factorization for\n", "planar diagram obtained by gluing the essential colored planar diagrams as\n", "tensor product of the matrix factorizations for the essential planar diagrams.\n", "Moreover, we show that some matrix factorizations deribed from tensor product\n", "of the essential matrix factorizations have homotopy equivalences corresponding\n", "to MOY relations.\n", "\n", "********************\n", "Fundamental matrix factorization in the FJRW-theory revisited\n", "********************\n", "Alexander Polishchuk\n", "2017-12-26T21:08:40Z\n", "We present an improved construction of the fundamental matrix factorization\n", "in the FJRW-theory given in arXiv:1105.2903. The revised construction is\n", "coordinate-free and works for a possibly nonabelian finite group of symmetries.\n", "One of the new ingrediants is the category of dg-matrix factorizations over a\n", "dg-scheme.\n", "\n", "********************\n", "Matrix factorizations and double line in $\\mathfrak{sl}_n$ quantum link\n", " invariant\n", "********************\n", "Yasuyoshi Yonezawa\n", "2007-03-28T07:26:02Z\n", "This article gives matrix factorizations for the trivalent diagrams and\n", "double line appearing in $\\mathfrak{sl}_n$ quantum link invariant.\n", " These matrix factorizations reconstruct Khovanov-Rozansky homology. And we\n", "show that the Euler characteristic of the matrix factorization for a double\n", "loop equals the quantum dimension of the representation $\\land^2 V$ of $U_q\n", "(\\mathfrak{sl}_n)$ in Section \\ref{sec3.3}.\n", "\n", "********************\n", "Finiteness of small factor analysis models\n", "********************\n", "Mathias Drton, Han Xiao\n", "2009-08-12T15:42:31Z\n", "We consider small factor analysis models with one or two factors. Fixing the\n", "number of factors, we prove a finiteness result about the covariance matrix\n", "parameter space when the size of the covariance matrix increases. According to\n", "this result, there exists a distinguished matrix size starting at which one can\n", "determine whether a given covariance matrix belongs to the parameter space by\n", "determining whether all principal submatrices of the distinguished size belong\n", "to the corresponding parameter space. We show that the distinguished matrix\n", "size is equal to four in the one-factor model and six with two factors.\n", "\n", "********************\n", "Stochastic Matrix Factorization\n", "********************\n", "Christopher Adams\n", "2016-09-19T15:19:44Z\n", "This paper considers a restriction to non-negative matrix factorization in\n", "which at least one matrix factor is stochastic. That is, the elements of the\n", "matrix factors are non-negative and the columns of one matrix factor sum to 1.\n", "This restriction includes topic models, a popular method for analyzing\n", "unstructured data. It also includes a method for storing and finding pictures.\n", "The paper presents necessary and sufficient conditions on the observed data\n", "such that the factorization is unique. In addition, the paper characterizes\n", "natural bounds on the parameters for any observed data and presents a\n", "consistent least squares estimator. The results are illustrated using a topic\n", "model analysis of PhD abstracts in economics and the problem of storing and\n", "retrieving a set of pictures of faces.\n", "\n", "********************\n", "Simulated Annealing with Levy Distribution for Fast Matrix\n", " Factorization-Based Collaborative Filtering\n", "********************\n", "Mostafa A. Shehata, Mohammad Nassef, Amr A. Badr\n", "2017-08-09T15:14:54Z\n", "Matrix factorization is one of the best approaches for collaborative\n", "filtering, because of its high accuracy in presenting users and items latent\n", "factors. The main disadvantages of matrix factorization are its complexity, and\n", "being very hard to be parallelized, specially with very large matrices. In this\n", "paper, we introduce a new method for collaborative filtering based on Matrix\n", "Factorization by combining simulated annealing with levy distribution. By using\n", "this method, good solutions are achieved in acceptable time with low\n", "computations, compared to other methods like stochastic gradient descent,\n", "alternating least squares, and weighted non-negative matrix factorization.\n", "\n", "********************\n", "Primitive factorizations, Jucys-Murphy elements, and matrix models\n", "********************\n", "Sho Matsumoto, Jonathan Novak\n", "2010-05-02T17:46:10Z\n", "A factorization of a permutation into transpositions is called \"primitive\" if\n", "its factors are weakly ordered. We discuss the problem of enumerating primitive\n", "factorizations of permutations, and its place in the hierarchy of previously\n", "studied factorization problems. Several formulas enumerating minimal primitive\n", "and possibly non-minimal primitive factorizations are presented, and\n", "interesting connections with Jucys-Murphy elements, symmetric group characters,\n", "and matrix models are described.\n", "\n", "********************\n", "Localization of Matrix Factorizations\n", "********************\n", "Ilya Krishtal, Thomas Strohmer, Tim Wertz\n", "2013-05-07T19:55:06Z\n", "Matrices with off-diagonal decay appear in a variety of fields in mathematics\n", "and in numerous applications, such as signal processing, statistics,\n", "communications engineering, condensed matter physics, and quantum chemistry.\n", "Numerical algorithms dealing with such matrices often take advantage\n", "(implicitly or explicitly) of the empirical observation that this off-diagonal\n", "decay property seems to be preserved when computing various useful matrix\n", "factorizations, such as the Cholesky factorization or the QR-factorization.\n", "There is a fairly extensive theory describing when the inverse of a matrix\n", "inherits the localization properties of the original matrix. Yet, except for\n", "the special case of band matrices, surprisingly very little theory exists that\n", "would establish similar results for matrix factorizations. We will derive a\n", "comprehensive framework to rigorously answer the question when and under which\n", "conditions the matrix factors inherit the localization of the original matrix\n", "for such fundamental matrix factorizations as the LU-, QR-, Cholesky, and Polar\n", "factorization.\n", "\n", "********************\n", "Monotone thematic factorizations of matrix functions\n", "********************\n", "Alberto A. Condori\n", "2009-08-28T20:29:13Z\n", "We continue the study of the so-called thematic factorizations of admissible\n", "very badly approximable matrix functions. These factorizations were introduced\n", "by V.V. Peller and N.J. Young for studying superoptimal approximation by\n", "bounded analytic matrix functions. Even though thematic indices associated with\n", "a thematic factorization of an admissible very badly approximable matrix\n", "function are not uniquely determined by the function itself, R.B. Alexeev and\n", "V.V. Peller showed that the thematic indices of any monotone non-increasing\n", "thematic factorization of an admissible very badly approximable matrix function\n", "are uniquely determined. In this paper, we prove the existence of monotone\n", "non-decreasing thematic factorizations for admissible very badly approximable\n", "matrix functions. It is also shown that the thematic indices appearing in a\n", "monotone non-decreasing thematic factorization are not uniquely determined by\n", "the matrix function itself. Furthermore, we show that the monotone\n", "non-increasing thematic factorization gives rise to a great number of other\n", "thematic factorizations.\n", "\n", "********************\n", "Badly approximable matrix functions and canonical factorizations\n", "********************\n", "R. B. Alexeev, V. V. Peller\n", "2001-01-26T22:08:33Z\n", "We continue studying the problem of analytic approximation of matrix\n", "functions. We introduce the notion of a partial canonical factorization of a\n", "badly approximable matrix function $\\Phi$ and the notion of a canonical\n", "factorization of a very badly approximable matrix function $\\Phi$. Such\n", "factorizations are defined in terms of so-called balanced unitary-valued\n", "functions which have many remarkable properties. Unlike the case of thematic\n", "factorizations studied earlier in [PY1], [PY2], [PT], [AP1], the factors in\n", "canonical factorizations (as well as partial canonical factorizations) are\n", "uniquely determined by the matrix function $\\Phi$ up to constant unitary\n", "factors. We study many properties of canonical factorizations. In particular we\n", "show that under certain natural assumptions on a function space $X$ the\n", "condition $\\Phi\\in X$ implies that all factors in a canonical factorization of\n", "$\\Phi$ belong to the same space $X$. In the last section we characterize the\n", "very badly approximable unitary-valued functions $U$ that satisfy the condition\n", "$\\|H_U\\|_{\\text e}<1$.\n", "\n", "********************\n", "Nonnegative Matrix Factorization Requires Irrationality\n", "********************\n", "Dmitry Chistikov, Stefan Kiefer, Ines Marušić, Mahsa Shirmohammadi, James Worrell\n", "2017-03-22T22:03:17Z\n", "Nonnegative matrix factorization (NMF) is the problem of decomposing a given\n", "nonnegative $n \\times m$ matrix $M$ into a product of a nonnegative $n \\times\n", "d$ matrix $W$ and a nonnegative $d \\times m$ matrix $H$. A longstanding open\n", "question, posed by Cohen and Rothblum in 1993, is whether a rational matrix $M$\n", "always has an NMF of minimal inner dimension $d$ whose factors $W$ and $H$ are\n", "also rational. We answer this question negatively, by exhibiting a matrix for\n", "which $W$ and $H$ require irrational entries.\n", "\n", "********************\n", "Spectral Factorization of Rank-Deficient Polynomial Matrix-Functions\n", "********************\n", "Lasha Ephremidze, Edem Lagvilava\n", "2010-08-18T16:04:13Z\n", "A spectral factorization theorem is proved for polynomial rank-deficient\n", "matrix-functions. The theorem is used to construct paraunitary matrix-functions\n", "with first rows given.\n", "\n", "********************\n", "From-Below Approximations in Boolean Matrix Factorization: Geometry and\n", " New Algorithm\n", "********************\n", "Radim Belohlavek, Martin Trnecka\n", "2013-06-20T15:19:22Z\n", "We present new results on Boolean matrix factorization and a new algorithm\n", "based on these results. The results emphasize the significance of\n", "factorizations that provide from-below approximations of the input matrix.\n", "While the previously proposed algorithms do not consider the possibly different\n", "significance of different matrix entries, our results help measure such\n", "significance and suggest where to focus when computing factors. An experimental\n", "evaluation of the new algorithm on both synthetic and real data demonstrates\n", "its good performance in terms of good coverage by the first k factors as well\n", "as a small number of factors needed for exact decomposition and indicates that\n", "the algorithm outperforms the available ones in these terms. We also propose\n", "future research topics.\n", "\n", "********************\n", "Necessary And Sufficient Conditions For Existence of the LU\n", " Factorization of an Arbitrary Matrix\n", "********************\n", "Pavel Okunev, Charles R. Johnson\n", "2005-06-19T23:10:13Z\n", "If $A$ is an n-by-n matrix over a field $F$ ($A\\in M_{n}(F)$), then $A$ is\n", "said to ``have an LU factorization'' if there exists a lower triangular matrix\n", "$L\\in M_{n}(F)$ and an upper triangular matrix $U\\in M_{n}(F)$ such that\n", "$$A=LU.$$ We give necessary and sufficient conditions for LU factorability of a\n", "matrix. Also simple algorithm for computing an LU factorization is given. It is\n", "an extension of the Gaussian elimination algorithm to the case of not\n", "necessarily invertible matrices. We consider possibilities to factors a matrix\n", "that does not have an LU factorization as the product of an ``almost lower\n", "triangular'' matrix and an ``almost upper triangular'' matrix. There are many\n", "ways to formalize what almost means. We consider some of them and derive\n", "necessary and sufficient conditions. Also simple algorithms for computing of an\n", "``almost LU factorization'' are given.\n", "\n" ] } ], "source": [ "n_examples = 20\n", "\n", "for entry in results[:n_examples]:\n", " print(20 * '*')\n", " print(entry['title'])\n", " print(20 * '*')\n", " print(', '.join(entry['authors']))\n", " print(entry['date'])\n", " print(entry['summary'])\n", " print()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "articles_df = pd.DataFrame(results)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "articles_df = pd.read_json('matrix_factorization_arxiv_query_result.json')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['affiliation', 'arxiv_comment', 'arxiv_primary_category', 'arxiv_url',\n", " 'author', 'author_detail', 'authors', 'doi', 'guidislink', 'id',\n", " 'journal_reference', 'links', 'pdf_url', 'published',\n", " 'published_parsed', 'summary', 'summary_detail', 'tags', 'title',\n", " 'title_detail', 'updated', 'updated_parsed'],\n", " dtype='object')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "articles_df.columns" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
affiliationarxiv_commentarxiv_primary_categoryarxiv_urlauthorauthor_detailauthorsdoiguidislinkid...pdf_urlpublishedpublished_parsedsummarysummary_detailtagstitletitle_detailupdatedupdated_parsed
0None22 pages, 4 figures, part of this work was pre...{'term': 'eess.SP', 'scheme': 'http://arxiv.or...http://arxiv.org/abs/1803.06234v1Koujin Takeda{'name': 'Koujin Takeda'}[Ryota Kawasumi, Koujin Takeda]NoneTruehttp://arxiv.org/abs/1803.06234v1...http://arxiv.org/pdf/1803.06234v12018-03-14T13:54:23Z(2018, 3, 14, 13, 54, 23, 2, 73, 0)We derive analytical expression of matrix fact...{'type': 'text/plain', 'language': None, 'base...[{'term': 'eess.SP', 'scheme': 'http://arxiv.o...Approximate Method of Variational Bayesian Mat...{'type': 'text/plain', 'language': None, 'base...2018-03-14T13:54:23Z(2018, 3, 14, 13, 54, 23, 2, 73, 0)
1None23 pages{'term': 'math.CV', 'scheme': 'http://arxiv.or...http://arxiv.org/abs/0909.5361v1Lasha Ephremidze{'name': 'Lasha Ephremidze'}[Gigla Janashia, Edem Lagvilava, Lasha Ephremi...NoneTruehttp://arxiv.org/abs/0909.5361v1...http://arxiv.org/pdf/0909.5361v12009-09-29T15:08:13Z(2009, 9, 29, 15, 8, 13, 1, 272, 0)A new method of matrix spectral factorization ...{'type': 'text/plain', 'language': None, 'base...[{'term': 'math.CV', 'scheme': 'http://arxiv.o...A New Method of Matrix Spectral Factorization{'type': 'text/plain', 'language': None, 'base...2009-09-29T15:08:13Z(2009, 9, 29, 15, 8, 13, 1, 272, 0)
2None6 pages{'term': 'math.CA', 'scheme': 'http://arxiv.or...http://arxiv.org/abs/1408.2611v1Paul W. Y. Lee{'name': 'Paul W. Y. Lee'}[Paul W. Y. Lee]NoneTruehttp://arxiv.org/abs/1408.2611v1...http://arxiv.org/pdf/1408.2611v12014-08-12T03:29:00Z(2014, 8, 12, 3, 29, 0, 1, 224, 0)We give proofs of QR factorization, Cholesky's...{'type': 'text/plain', 'language': None, 'base...[{'term': 'math.CA', 'scheme': 'http://arxiv.o...Matrix Factorizations via the Inverse Function...{'type': 'text/plain', 'language': None, 'base...2014-08-12T03:29:00Z(2014, 8, 12, 3, 29, 0, 1, 224, 0)
3NoneNone{'term': 'math.CO', 'scheme': 'http://arxiv.or...http://arxiv.org/abs/1405.6315v1Thomas M. Richardson{'name': 'Thomas M. Richardson'}[Thomas M. Richardson]NoneTruehttp://arxiv.org/abs/1405.6315v1...http://arxiv.org/pdf/1405.6315v12014-05-24T16:16:58Z(2014, 5, 24, 16, 16, 58, 5, 144, 0)The reciprocal Pascal matrix is the Hadamard i...{'type': 'text/plain', 'language': None, 'base...[{'term': 'math.CO', 'scheme': 'http://arxiv.o...The Reciprocal Pascal Matrix{'type': 'text/plain', 'language': None, 'base...2014-05-24T16:16:58Z(2014, 5, 24, 16, 16, 58, 5, 144, 0)
4None20 pages{'term': 'math.FA', 'scheme': 'http://arxiv.or...http://arxiv.org/abs/math/0101182v2V. V. Peller{'name': 'V. V. Peller'}[R. B. Alexeev, V. V. Peller]NoneTruehttp://arxiv.org/abs/math/0101182v2...http://arxiv.org/pdf/math/0101182v22001-01-22T23:32:55Z(2001, 1, 22, 23, 32, 55, 0, 22, 0)We study the problem of invariance of indices ...{'type': 'text/plain', 'language': None, 'base...[{'term': 'math.FA', 'scheme': 'http://arxiv.o...Invariance properties of thematic factorizatio...{'type': 'text/plain', 'language': None, 'base...2001-01-26T21:54:16Z(2001, 1, 26, 21, 54, 16, 4, 26, 0)
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " affiliation arxiv_comment \\\n", "0 None 22 pages, 4 figures, part of this work was pre... \n", "1 None 23 pages \n", "2 None 6 pages \n", "3 None None \n", "4 None 20 pages \n", "\n", " arxiv_primary_category \\\n", "0 {'term': 'eess.SP', 'scheme': 'http://arxiv.or... \n", "1 {'term': 'math.CV', 'scheme': 'http://arxiv.or... \n", "2 {'term': 'math.CA', 'scheme': 'http://arxiv.or... \n", "3 {'term': 'math.CO', 'scheme': 'http://arxiv.or... \n", "4 {'term': 'math.FA', 'scheme': 'http://arxiv.or... \n", "\n", " arxiv_url author \\\n", "0 http://arxiv.org/abs/1803.06234v1 Koujin Takeda \n", "1 http://arxiv.org/abs/0909.5361v1 Lasha Ephremidze \n", "2 http://arxiv.org/abs/1408.2611v1 Paul W. Y. Lee \n", "3 http://arxiv.org/abs/1405.6315v1 Thomas M. Richardson \n", "4 http://arxiv.org/abs/math/0101182v2 V. V. Peller \n", "\n", " author_detail \\\n", "0 {'name': 'Koujin Takeda'} \n", "1 {'name': 'Lasha Ephremidze'} \n", "2 {'name': 'Paul W. Y. Lee'} \n", "3 {'name': 'Thomas M. Richardson'} \n", "4 {'name': 'V. V. Peller'} \n", "\n", " authors doi guidislink \\\n", "0 [Ryota Kawasumi, Koujin Takeda] None True \n", "1 [Gigla Janashia, Edem Lagvilava, Lasha Ephremi... None True \n", "2 [Paul W. Y. Lee] None True \n", "3 [Thomas M. Richardson] None True \n", "4 [R. B. Alexeev, V. V. Peller] None True \n", "\n", " id ... \\\n", "0 http://arxiv.org/abs/1803.06234v1 ... \n", "1 http://arxiv.org/abs/0909.5361v1 ... \n", "2 http://arxiv.org/abs/1408.2611v1 ... \n", "3 http://arxiv.org/abs/1405.6315v1 ... \n", "4 http://arxiv.org/abs/math/0101182v2 ... \n", "\n", " pdf_url published \\\n", "0 http://arxiv.org/pdf/1803.06234v1 2018-03-14T13:54:23Z \n", "1 http://arxiv.org/pdf/0909.5361v1 2009-09-29T15:08:13Z \n", "2 http://arxiv.org/pdf/1408.2611v1 2014-08-12T03:29:00Z \n", "3 http://arxiv.org/pdf/1405.6315v1 2014-05-24T16:16:58Z \n", "4 http://arxiv.org/pdf/math/0101182v2 2001-01-22T23:32:55Z \n", "\n", " published_parsed \\\n", "0 (2018, 3, 14, 13, 54, 23, 2, 73, 0) \n", "1 (2009, 9, 29, 15, 8, 13, 1, 272, 0) \n", "2 (2014, 8, 12, 3, 29, 0, 1, 224, 0) \n", "3 (2014, 5, 24, 16, 16, 58, 5, 144, 0) \n", "4 (2001, 1, 22, 23, 32, 55, 0, 22, 0) \n", "\n", " summary \\\n", "0 We derive analytical expression of matrix fact... \n", "1 A new method of matrix spectral factorization ... \n", "2 We give proofs of QR factorization, Cholesky's... \n", "3 The reciprocal Pascal matrix is the Hadamard i... \n", "4 We study the problem of invariance of indices ... \n", "\n", " summary_detail \\\n", "0 {'type': 'text/plain', 'language': None, 'base... \n", "1 {'type': 'text/plain', 'language': None, 'base... \n", "2 {'type': 'text/plain', 'language': None, 'base... \n", "3 {'type': 'text/plain', 'language': None, 'base... \n", "4 {'type': 'text/plain', 'language': None, 'base... \n", "\n", " tags \\\n", "0 [{'term': 'eess.SP', 'scheme': 'http://arxiv.o... \n", "1 [{'term': 'math.CV', 'scheme': 'http://arxiv.o... \n", "2 [{'term': 'math.CA', 'scheme': 'http://arxiv.o... \n", "3 [{'term': 'math.CO', 'scheme': 'http://arxiv.o... \n", "4 [{'term': 'math.FA', 'scheme': 'http://arxiv.o... \n", "\n", " title \\\n", "0 Approximate Method of Variational Bayesian Mat... \n", "1 A New Method of Matrix Spectral Factorization \n", "2 Matrix Factorizations via the Inverse Function... \n", "3 The Reciprocal Pascal Matrix \n", "4 Invariance properties of thematic factorizatio... \n", "\n", " title_detail updated \\\n", "0 {'type': 'text/plain', 'language': None, 'base... 2018-03-14T13:54:23Z \n", "1 {'type': 'text/plain', 'language': None, 'base... 2009-09-29T15:08:13Z \n", "2 {'type': 'text/plain', 'language': None, 'base... 2014-08-12T03:29:00Z \n", "3 {'type': 'text/plain', 'language': None, 'base... 2014-05-24T16:16:58Z \n", "4 {'type': 'text/plain', 'language': None, 'base... 2001-01-26T21:54:16Z \n", "\n", " updated_parsed \n", "0 (2018, 3, 14, 13, 54, 23, 2, 73, 0) \n", "1 (2009, 9, 29, 15, 8, 13, 1, 272, 0) \n", "2 (2014, 8, 12, 3, 29, 0, 1, 224, 0) \n", "3 (2014, 5, 24, 16, 16, 58, 5, 144, 0) \n", "4 (2001, 1, 26, 21, 54, 16, 4, 26, 0) \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "articles_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## arXiv categories" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "categories = articles_df['arxiv_primary_category'].apply(itemgetter('term'))\n", "\n", "main_categories = categories.apply(lambda s: s.split('.')[0].split('-')[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Toplevel categories" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "main_categories_counts = main_categories.value_counts(ascending=True)\n", "main_categories_counts.plot.barh()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Toplevel categories with more than 200 papers" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAD8CAYAAABdCyJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAE/ZJREFUeJzt3XuwZWV95vHvQwfoRrC5NFKAxgNIRLnD0YnXEYYIohnHEZWJNaJJ2QXxEswwVWRISM8lE5OQqQkjkWodA1HGoBhnrFjM4CAXhQpON/Q1XAOdkUtBehCEWGBofvPHXh22h3Pr855z9j7b76dq1177Xe9e+/eydvfDu9baq1NVSJI0V7sNugBJ0tJmkEiSmhgkkqQmBokkqYlBIklqYpBIkpoYJJKkJgaJJKmJQSJJavIzgy5gvq1atarGxsYGXYYkLRnr16/fXlUHzvX9IxckY2NjrFu3btBlSNKSkeRvWt7voS1JUhODRJLUxCCRJDUxSCRJTQwSSVKTkbtq6+GHH2bNmjWDLkOSFs2g/85zRiJJamKQSJKaLGqQJBlLsmUxP1OStLCckUiSmgwiSJYl+VySrUmuS7IiyRFJ/meS9Um+k+QogCRXJLm8a7snybsGUK8kaRqDCJIjgcuq6mjgCeC9wFrgE1V1MnAB8Md9/ceAfwy8E7g8yfLFLVeSNJ1BXP77QFVt6JbX0wuKNwJfTbKzz559/b9SVc8D9ya5HzgK2NC3niSrgdUAK1euXLjKJUkvMoggebZveQdwEPBEVZ0wRf+a4TVVtZberIZDDjnkReslSQtnGE62/xB4IMn7ANJzfN/69yXZLckRwOHA3YMoUpI0uWEIEoAPAr+SZCOwFXh337q7gZuAa4Fzq+qZAdQnSZrCoh7aqqptwDF9ry/pW33GFG+7pao+tZB1SZLmblhmJJKkJWqob9pYVR8edA2SpOmlarQuchofHy//zXZJmr0k66tqfK7v99CWJKmJQSJJamKQSJKaGCSSpCYGiSSpiUEiSWpikEiSmhgkkqQmBokkqYlBIklqYpBIkpoYJJKkJkN999+5+PFDT/Pghd8ZdBmSNKWXf/otgy5hXjkjkSQ1MUgkSU2GJkiSnJDkzL7Xa5JcMMiaJEkzG5ogAU4AzpyxlyRpqMxrkCQZS3JXks8n2ZLkqiSnJbklyb1JXt89bk1yR/f86iR7AP8O+ECSDUk+0G3ytUluTHJ/kk/OZ62SpPmxEDOSVwF/BBwHHAX8EvBm4ALg3wB3AW+tqhOBi4H/WFU/7pavrqoTqurqbltHAacDrwd+O8nuC1CvJKnBQlz++0BVbQZIshW4vqoqyWZgDFgJXJnkSKCA6cLhm1X1LPBskseAg4AHJ3ZKshpYDXDoSw+az7FIkmawEDOSZ/uWn+97/Ty94Pr3wA1VdQzwi8DyWW5rB1MEX1Wtrarxqhrff69951y4JGnXDeJk+0rgoW75w33tTwH7LHo1kqQmgwiS3wd+N8ktwLK+9hvonVzvP9kuSRpy83qOpKq2Acf0vf7wFOt+ru9tv9Wtfxx43TTbPmaqdZKkwRmm35FIkpYgg0SS1GTk7v67x6F7j9ydNSVpmDkjkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1Gbm7/z56/3384QfeNegypGn9q6v/YtAlSPPGGYkkqYlBIklqYpBIkpoYJJKkJkMVJEk+lGRTko1JvpjkfUm2dK9vHnR9kqQXG5qrtpIcDVwEvKmqtifZH7gJOL2qHkqy7zTvXQ2sBthvrxWLUq8kqWeYZiSnAtdU1XaAqnocuAW4IslHgWVTvbGq1lbVeFWNv2TPPRanWkkSMFxBEqD6G6rqXOA3gVcAG5IcMIjCJElTG6YguR54/86wSLJ/kiOq6raquhjYTi9QJElDZGjOkVTV1iS/A9yUZAdwB/DSJEfSm61cD2wcZI2SpBcbmiABqKorgSsHXYckafaG6dCWJGkJGqoZyXw46PBXeUM8SVpEzkgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU1G7u6/j/3NU1x27rcHXcZPtY9dfuqgS5C0iJyRSJKaGCSSpCZLJkiSXJHkrEHXIUn6SUsmSCRJw2nBgiTJh5JsSrIxyReTvDLJ9V3b9Ul+tut3RZJLk9ya5P6ds470fCbJXyX5JvCyhapVkjR3CxIkSY4GLgJOrarjgV8DPgP8aVUdB1wFXNr3loOBNwPvAj7dtb0HeDVwLPBR4I3TfN7qJOuSrHv6mSfmeziSpGks1IzkVOCaqtoOUFWPA28A/lu3/ov0gmOn/15Vz1fVXwEHdW1vBb5cVTuq6mFgymt6q2ptVY1X1fjey/ed77FIkqaxUEESoGbo07/+2QnvnayPJGkILVSQXA+8P8kBAEn2B24Fzu7WfxD47gzbuBk4O8myJAcDpyxQrZKkBgvyy/aq2prkd4CbkuwA7gA+CXwhyb8G/hb4yAyb+Tq9Q2SbgXuAmxaiVklSmwW7RUpVXQlcOaH5RffOqKoPT3i9d/dcwMcXqj5J0vzwdySSpCYjd9PGl71yH28aKEmLyBmJJKmJQSJJamKQSJKaGCSSpCYGiSSpiUEiSWpikEiSmhgkkqQmBokkqYlBIklqYpBIkpoYJJKkJgaJJKnJyN3995ktW7nzqNcMuowF95q77hx0CZIEOCORJDUySCRJTQYaJEnOT7LXfPWTJC2+Qc9IzgdmExCz7SdJWmSLdrI9yUuArwAvB5YBXwUOAW5Isr2qTknyWeB1wArgmqr67SSfnNhvsWqWJM1sMa/aOgN4uKreCZBkJfAR4JSq2t71uaiqHk+yDLg+yXFVdWmSX5/QT5I0JBbz0NZm4LQkv5fkLVX15CR93p/kduAO4GjgtbPZcJLVSdYlWff4jufmsWRJ0kwWbUZSVfckORk4E/jdJNf1r09yGHAB8Lqq+kGSK4Dls9z2WmAtwDHLV9S8Fi5JmtaizUiSHAL8qKq+BFwCnAQ8BezTdXkp8HfAk0kOAt7R9/b+fpKkIbKY50iOBf4gyfPA3wPnAW8Ark3ySHey/Q5gK3A/cEvfe9f291vEmiVJM0jVaB0JOmb5ivrq2Nigy1hw3iJF0nxJsr6qxuf6/kH/jkSStMSN3E0blx9zNK9Zt27QZUjSTw1nJJKkJgaJJKmJQSJJamKQSJKaGCSSpCYGiSSpiUEiSWpikEiSmhgkkqQmBokkqYlBIklqYpBIkpoYJJKkJiN399+t/28rx1557KDLYPM5mwddgiQtCmckkqQmBokkqcmSCJIk5yfZa9B1SJJebEkECXA+YJBI0hCalyBJclGSu5P87yRfTnJBkhuTjHfrVyXZ1i2PJflOktu7xxu79rd177kmyV1JrkrPJ4FDgBuS3DAf9UqS5k/zVVtJTgbOBk7stnc7sH6atzwG/EJVPZPkSODLwHi37kTgaOBh4BbgTVV1aZJfB06pqu2t9UqS5td8XP77FuDrVfUjgCTfmKH/7sBnkpwA7AB+rm/d96rqwW47G4Ax4LszFZBkNbAaYPcDdt/V+iVJDebrdyQ1SdtzvHDobHlf+6eAR4Hju/XP9K17tm95x2zrq6q1wFqAFYetmKwWSdICmY9zJDcD70myIsk+wC927duAk7vls/r6rwQeqarngX8JLJvFZzwF7DMPtUqS5llzkFTV7cDVwAbga8B3ulWXAOcluRVY1feWPwbOSfKX9A5r/d0sPmYtcK0n2yVp+KRqfo8EJVkDPF1Vl8zrhmdpxWEr6lVrXjWIj/4J3iJF0lKRZH1Vjc/cc3JL5XckkqQhNe83bayqNfO9TUnS8Bq5u/8efcDRrDtn3aDLkKSfGh7akiQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1GTkbtrIw3fAmpWD+ew1Tw7mcyVpgJyRSJKaGCSSpCZzDpIk25KsmrnntNsYT3JpyzYkSYM10HMkVbUO8F+hkqQlbMYZSZKxJHcluTLJpiTXJNmrW/2JJLcn2ZzkqCS7Jbk3yYHde3dLcl+SVUnel2RLko1Jbu7Wvy3JX3TLeyf5k25bm5K8N8myJFd079uc5FML9l9CkjQnsz209WpgbVUdB/wQ+NWufXtVnQR8Frigqp4HvgR8sFt/GrCxqrYDFwOnV9XxwD+d5DN+C3iyqo7tPufbwAnAoVV1TFUdC/zJrg9RkrSQZhsk36+qW7rlLwFv7pb/vHteD4x1y18APtQt/zIv/OV/C3BFko8Cyyb5jNOAy3a+qKofAPcDhyf5L0nOoBdiL5JkdZJ1Sdb97Y9qlkOSJM2H2QbJxL+dd75+tnveQXe+paq+Dzya5FTgHwHXdu3nAr8JvALYkOSACdvMxM/pwuR44EbgY8DnJy2uam1VjVfV+IF7ZZZDkiTNh9kGyc8meUO3/C+A787Q//P0Zi5fqaodAEmOqKrbqupiYDu9QOl3HfDxnS+S7NddFbZbVX2N3qGvk2ZZryRpkcw2SO4EzkmyCdif3jmR6XwD2JufPKfxB90J8y3AzcDGCe/5D8B+O0/IA6cAhwI3JtkAXAH8xizrlSQtktle/vt8d2iq39jOhe4y3rf1rTue3kn2u/r6/PNJtntj96CqngbOmaSPsxBJGmLz/juSJBcC5/HClVuSpBE246GtqtpWVcfMdoNV9emqemVVzXQeRZI0Akbv7r+HnAhr/LG8JC0Wb9ooSWpikEiSmhgkkqQmBokkqYlBIklqYpBIkpoYJJKkJgaJJKmJQSJJamKQSJKaGCSSpCYGiSSpycjdtHHzQ08yduE3F+Wztn36nYvyOZI0zJyRSJKaGCSSpCYGiSSpyZIJkiQ3JhkfdB2SpJ+0ZIJEkjScFjxIkowluTPJ55JsTXJdkhX9M4wkq5Js65aXJbkkyeYkm5J8YqFrlCTN3WLNSI4ELquqo4EngPdO03c1cBhwYlUdB1w108aTrE6yLsm6HT96cl4KliTNzmIFyQNVtaFbXg+MTdP3NODyqnoOoKoen2njVbW2qsaranzZXiubi5Ukzd5iBcmzfcs76P0Q8rm+z1/etz5ALVJdkqRGgzzZvg04uVs+q6/9OuDcJD8DkGT/Ra5LkrQLBhkklwDnJbkVWNXX/nng/wKbkmwEfmkQxUmSZidVo3UUac+Dj6yDz/nPi/JZ3mtL0ihIsr6q5vw7PX9HIklqMnJ3/z320JWsc6YgSYvGGYkkqYlBIklqYpBIkpoYJJKkJgaJJKmJQSJJajJyP0hM8hRw96DrWCCrgO2DLmKBOLalybEtTRPH9sqqOnCuGxu535EAd7f8QnOYJVnn2JYex7Y0ObbZ89CWJKmJQSJJajKKQbJ20AUsIMe2NDm2pcmxzdLInWyXJC2uUZyRSJIW0cgESZIzktyd5L4kFw66nrlIsi3J5iQbkqzr2vZP8q0k93bP+3XtSXJpN95NSU4abPU/KckXkjyWZEtf2y6PJck5Xf97k5wziLFMNMXY1iR5qNt3G5Kc2bfuN7qx3Z3k9L72ofvOJnlFkhuS3Jlka5Jf69qX/L6bZmyjsu+WJ/leko3d+P5t135Yktu6/XB1kj269j271/d168f6tjXpuKdUVUv+ASwD/ho4HNgD2Ai8dtB1zWEc24BVE9p+H7iwW74Q+L1u+UzgWnr/xv3PA7cNuv4Jdb8VOAnYMtexAPsD93fP+3XL+w3p2NYAF0zS97Xd93FP4LDue7psWL+zwMHASd3yPsA93RiW/L6bZmyjsu8C7N0t7w7c1u2TrwBnd+2XA+d1y78KXN4tnw1cPd24p/vsUZmRvB64r6rur6ofA38GvHvANc2XdwNXdstXAv+sr/1Pq+cvgX2THDyIAidTVTcDj09o3tWxnA58q6oer6ofAN8Czlj46qc3xdim8m7gz6rq2ap6ALiP3vd1KL+zVfVIVd3eLT8F3Akcygjsu2nGNpWltu+qqp7uXu7ePQo4Fbima5+473bu02uAf5IkTD3uKY1KkBwKfL/v9YNM/wUZVgVcl2R9ktVd20FV9Qj0/iAAL+val+KYd3UsS22MH+8O73xh56EflvDYukMdJ9L7P9uR2ncTxgYjsu+SLEuyAXiMXnj/NfBEVT3Xdemv9R/G0a1/EjiAOYxvVIIkk7QtxcvR3lRVJwHvAD6W5K3T9B2VMcPUY1lKY/wscARwAvAI8Idd+5IcW5K9ga8B51fVD6frOknbUI9vkrGNzL6rqh1VdQLwcnqziNdM1q17nrfxjUqQPAi8ou/1y4GHB1TLnFXVw93zY8DX6X0RHt15yKp7fqzrvhTHvKtjWTJjrKpHuz/EzwOf44VDAUtubEl2p/cX7VVV9edd80jsu8nGNkr7bqeqegK4kd45kn2T7LwdVn+t/zCObv1Keodsd3l8oxIk/wc4srs6YQ96J46+MeCadkmSlyTZZ+cy8HZgC71x7Lzi5Rzgf3TL3wA+1F018/PAkzsPPQyxXR3L/wLenmS/7nDD27u2oTPh/NR76O076I3t7O4KmcOAI4HvMaTf2e4Y+X8F7qyq/9S3asnvu6nGNkL77sAk+3bLK4DT6J0HugE4q+s2cd/t3KdnAd+u3tn2qcY9tUFfaTBfD3pXj9xD75jgRYOuZw71H07vSomNwNadY6B3zPJ64N7uef964QqNy7rxbgbGBz2GCeP5Mr3DBH9P7/9wfmUuYwF+md7JvvuAjwx6XNOM7Ytd7Zu6P4gH9/W/qBvb3cA7hvk7C7yZ3mGMTcCG7nHmKOy7acY2KvvuOOCObhxbgIu79sPpBcF9wFeBPbv25d3r+7r1h8807qke/rJdktRkVA5tSZIGxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSk/8Pa1XVc2H6yLgAAAAASUVORK5CYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "main_categories_counts[main_categories_counts > 200].plot.barh()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "categories.value_counts(ascending=True)[-10:].plot.barh()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract keywords from summaries" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 5.37 s, sys: 611 ms, total: 5.99 s\n", "Wall time: 1min 40s\n" ] } ], "source": [ "%%time\n", "\n", "articles_df['summary_keywords'] = list(\n", " map_parallel(extract_keywords, articles_df['summary'])\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "********************\n", "Approximate Method of Variational Bayesian Matrix\n", " Factorization/Completion with Sparse Prior\n", "********************\n", "keywords: ['matrix', 'analytical', 'bayes', 'completion']\n", "\n", "********************\n", "A New Method of Matrix Spectral Factorization\n", "********************\n", "keywords: ['spectral', 'method']\n", "\n", "********************\n", "Matrix Factorizations via the Inverse Function Theorem\n", "********************\n", "keywords: ['factorizations', 'function']\n", "\n", "********************\n", "The Reciprocal Pascal Matrix\n", "********************\n", "keywords: ['pascal matrix']\n", "\n", "********************\n", "Invariance properties of thematic factorizations of matrix functions\n", "********************\n", "keywords: ['thematic', 'results', 'superoptimal', 'matrix']\n", "\n", "********************\n", "Online Matrix Factorization via Broyden Updates\n", "********************\n", "keywords: ['matrix', 'algorithms', 'missing', 'objective', 'dataset']\n", "\n", "********************\n", "Matrix factorizations and intertwiners of the fundamental\n", " representations of quantum group U_q (sl_n)\n", "********************\n", "keywords: ['matrix', 'equivalences', 'link', 'paper', 'naturally', 'planar']\n", "\n", "********************\n", "Fundamental matrix factorization in the FJRW-theory revisited\n", "********************\n", "keywords: ['construction', 'matrix']\n", "\n", "********************\n", "Matrix factorizations and double line in $\\mathfrak{sl}_n$ quantum link\n", " invariant\n", "********************\n", "keywords: ['ref', 'quantum link', 'line', 'loop']\n", "\n", "********************\n", "Finiteness of small factor analysis models\n", "********************\n", "keywords: ['matrix parameter', 'factors', 'size']\n", "\n", "********************\n", "Stochastic Matrix Factorization\n", "********************\n", "keywords: ['paper', 'unstructured', 'data', 'factors', 'model']\n", "\n", "********************\n", "Simulated Annealing with Levy Distribution for Fast Matrix\n", " Factorization-Based Collaborative Filtering\n", "********************\n", "keywords: ['methods', 'computations', 'simulated', 'filtering', 'non', 'latent', 'descent', 'good']\n", "\n", "********************\n", "Primitive factorizations, Jucys-Murphy elements, and matrix models\n", "********************\n", "keywords: ['primitive', 'factorizations', 'enumerating', 'minimal']\n", "\n", "********************\n", "Localization of Matrix Factorizations\n", "********************\n", "keywords: ['matrix', 'factors', 'decay', 'extensive theory', 'numerical', 'similar', 'matrices', 'properties']\n", "\n", "********************\n", "Monotone thematic factorizations of matrix functions\n", "********************\n", "keywords: ['thematic', 'matrix', 'approximation', 'indices', 'non']\n", "\n", "********************\n", "Badly approximable matrix functions and canonical factorizations\n", "********************\n", "keywords: ['unitary', 'factors', 'approximable', 'function', 'study']\n", "\n", "********************\n", "Nonnegative Matrix Factorization Requires Irrationality\n", "********************\n", "keywords: ['question', 'matrix', 'irrational']\n", "\n", "********************\n", "Spectral Factorization of Rank-Deficient Polynomial Matrix-Functions\n", "********************\n", "keywords: ['matrix', 'factorization']\n", "\n", "********************\n", "From-Below Approximations in Boolean Matrix Factorization: Geometry and\n", " New Algorithm\n", "********************\n", "keywords: ['matrix', 'algorithms', 'different', 'factors', 'new results', 'propose']\n", "\n", "********************\n", "Necessary And Sufficient Conditions For Existence of the LU\n", " Factorization of an Arbitrary Matrix\n", "********************\n", "keywords: ['triangular', 'invertible', 'elimination']\n", "\n" ] } ], "source": [ "for __, row in itertools.islice(articles_df.iterrows(), n_examples):\n", " print(20 * '*')\n", " print(row['title'])\n", " print(20 * '*')\n", " print('keywords:', row['summary_keywords'])\n", " print()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "article_keyword_lengths = articles_df['summary_keywords'].apply(len)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "article_keyword_lengths.plot.hist(bins=article_keyword_lengths.max(), title='Number of summary keywords')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Try to predict tags given summaries" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(9403, 23)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "valid_examples, valid_example_categories = filter_out_small_categories(articles_df, main_categories)\n", "valid_examples.shape" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "vectorized_data, (ohe, le) = vectorize_text(valid_examples)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'll use [Factorization Machine](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) model. FMs are well suited for such high-dimensional sparse data. This model comes from [FastFM](https://github.com/ibayer/fastFM) library.\n", "\n", "Note `OneVsRestClassifier`. We have to use it here because FastFM doesn't support multiclass classification." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "fm = FMClassifier(\n", " rank=50,\n", " n_iter=10000,\n", " step_size=0.0001,\n", " l2_reg_w=0.01,\n", " l2_reg_V=0.01\n", ")\n", "fm_multiclass = OneVsRestClassifier(fm)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "x_train, x_test, y_train, y_test, y_train_labels, y_test_labels = train_test_split(\n", " vectorized_data['features'],\n", " vectorized_data['labels_onehot'],\n", " vectorized_data['labels'],\n", " stratify=vectorized_data['labels'],\n", " test_size=0.2,\n", " random_state=0\n", ")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 14 s, sys: 171 ms, total: 14.1 s\n", "Wall time: 12.5 s\n" ] }, { "data": { "text/plain": [ "OneVsRestClassifier(estimator=FMClassifier(init_stdev=0.1, l2_reg=None, l2_reg_V=0.01, l2_reg_w=0.01,\n", " n_iter=10000, random_state=123, rank=50, step_size=0.0001),\n", " n_jobs=1)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "\n", "fm_multiclass.fit(x_train, y_train)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "y_test_pred = predict_ovr(fm_multiclass, x_test)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train score: 0.719090667376 \n", "test score: 0.658692185008\n" ] } ], "source": [ "print(\n", " 'train score:', accuracy_score(y_train_labels, predict_ovr(fm_multiclass, x_train)), '\\n'\n", " 'test score: ', accuracy_score(y_test_labels, y_test_pred)\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Confusion matrix" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " cond 0.52 0.61 0.56 176\n", " cs 0.53 0.60 0.56 309\n", " hep 0.82 0.86 0.84 579\n", " math 0.63 0.77 0.69 542\n", " nucl 0.62 0.30 0.41 50\n", " physics 0.33 0.02 0.03 57\n", " quant 0.53 0.16 0.25 62\n", " stat 0.54 0.07 0.12 106\n", "\n", "avg / total 0.64 0.66 0.63 1881\n", "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "report_classification_confusion_matrix(y_test_labels, y_test_pred, le)" ] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "nnets", "language": "python", "name": "nnets" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }