{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import scipy\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn import feature_extraction, datasets\n", "import numba" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "data = datasets.fetch_20newsgroups()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "texts = data.data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "vectorizer = feature_extraction.text.CountVectorizer(min_df=5)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "vectorized_texts = vectorizer.fit_transform(texts)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(11314, 25941)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectorized_texts.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DATA REPRESENTATION\n", "\n", "We use Numba so we can't use sparse matrices\n", "\n", "Need to encode documents as dense vectors" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "y_indices, x_indices, __ = scipy.sparse.find(vectorized_texts)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_text_vector_indices(text_vectors):\n", " \"\"\"\n", " Convert sparse binary count vectorizer's matrix into dense matrix\n", " doc_word_indices[i, j] = w \n", " means that in document *i* word *j* has positon *w* in dictionary \n", " \"\"\"\n", " n_documents = text_vectors.shape[0]\n", " y_indices, x_indices, __ = scipy.sparse.find(text_vectors)\n", " max_doc_length = pd.DataFrame({'y': y_indices, 'x': x_indices}).groupby('x').agg('count').max()[0]\n", " doc_word_indices = -np.ones((n_documents, max_doc_length + 1), dtype='int32')\n", " for i in range(n_documents):\n", " word_indices = y_indices[x_indices == i]\n", " for j in range(len(word_indices)):\n", " doc_word_indices[i, j] = word_indices[j]\n", " return doc_word_indices " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 7.06 s, sys: 23.5 ms, total: 7.08 s\n", "Wall time: 7.08 s\n" ] } ], "source": [ "%%time\n", "text_vector_indices = get_text_vector_indices(vectorized_texts)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "\n", "@numba.jit(nopython=True)\n", "def count_words_in_topics(document_topics, text_vector_indices, topic_word_counts):\n", " for text in range(text_vector_indices.shape[0]):\n", " for i in text_vector_indices[text]:\n", " if i < 0:\n", " break\n", " topic_word_counts[document_topics[text], i] += 1\n", " return topic_word_counts\n", "\n", "\n", "def make_topics(text_vector_indices, num_topics):\n", " num_texts, __ = text_vector_indices.shape\n", " num_words = text_vector_indices.max()\n", " topic_document_counts = np.zeros((num_topics,)) # m_z\n", " topic_sizes = np.zeros((num_topics,)) # n_z\n", " topic_word_counts = np.zeros((num_topics, num_words+1)) # n^w_z\n", " \n", " text_lenghts = np.array((text_vector_indices >= 0).sum(axis=1))\n", " \n", " document_topics = np.random.randint(0, num_topics, size=num_texts)\n", " topic_word_counts = count_words_in_topics(document_topics, text_vector_indices, topic_word_counts)\n", " \n", " document_topics_ohe = np.zeros((num_texts, num_topics))\n", " document_topics_ohe[np.arange(num_texts), document_topics] = 1\n", " \n", " topic_document_counts += document_topics_ohe.sum(axis=0)\n", " \n", " topic_sizes = (text_lenghts[:, np.newaxis] * document_topics_ohe).sum(axis=0)\n", " return topic_document_counts, topic_sizes, topic_word_counts, document_topics" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 10, 59, 109, ..., -1, -1, -1],\n", " [ 13, 17, 68, ..., -1, -1, -1],\n", " [2064, 3665, 4191, ..., -1, -1, -1],\n", " ...,\n", " [ 70, 127, 760, ..., -1, -1, -1],\n", " [1908, 2998, 3867, ..., -1, -1, -1],\n", " [1830, 4516, 4881, ..., -1, -1, -1]], dtype=int32)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_vector_indices" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 397 ms, sys: 20 ms, total: 417 ms\n", "Wall time: 417 ms\n" ] } ], "source": [ "%%time\n", "num_topics = 4\n", "topic_document_counts, topic_sizes, topic_word_counts, document_topics = make_topics(text_vector_indices, num_topics)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "@numba.jit(nopython=True)\n", "def sampling_distribution(doc_idx, topic_document_counts, topic_sizes, topic_word_counts, alpha, beta, epsilon=1e-10):\n", " num_topics = topic_sizes.shape[0]\n", " log_topic_document_counts = np.log(topic_document_counts + alpha)\n", " log_D = np.log(num_texts - 1 + num_topics * alpha)\n", " log_topic_word_counts = np.log(topic_word_counts + beta).sum(axis=0)\n", " log_topic_sizes = np.log(topic_sizes + num_words * beta + i - 1)\n", " return np.exp(log_topic_document_counts - log_D + log_topic_word_counts - log_topic_sizes)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "@numba.jit(nopython=True)\n", "def sampling_distribution(doc, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts):\n", " \"\"\"\n", " sampling distribution for document doc given m_z, n_z, n^w_z\n", " formula (3) from paper\n", " \"\"\"\n", " D, maxsize = text_vector_indices.shape\n", " K, V = topic_word_counts.shape\n", " m_z, n_z, n_z_w = topic_document_counts, topic_sizes, topic_word_counts \n", "\n", " log_p = np.zeros((K,)) \n", "\n", " # We break the formula into the following pieces\n", " # p = N1*N2/(D1*D2) = exp(lN1 - lD1 + lN2 - lD2)\n", " # lN1 = np.log(m_z[z] + alpha)\n", " # lN2 = np.log(D - 1 + K*alpha)\n", " # lN2 = np.log(product(n_z_w[w] + beta)) = sum(np.log(n_z_w[w] + beta))\n", " # lD2 = np.log(product(n_z[d] + V*beta + i -1)) = sum(np.log(n_z[d] + V*beta + i -1))\n", "\n", " lD1 = np.log(D - 1 + K * alpha)\n", " for label in range(K):\n", " lN1 = np.log(m_z[label] + alpha)\n", " lN2 = 0\n", " lD2 = 0\n", " doc_size = 0\n", " for word in doc:\n", " if word < 0:\n", " break\n", " lN2 += np.log(n_z_w[label, word] + beta)\n", " doc_size += 1\n", " for j in range(1, doc_size +1):\n", " lD2 += np.log(n_z[label] + V * beta + j - 1)\n", " log_p[label] = lN1 - lD1 + lN2 - lD2\n", " \n", " log_p = log_p - log_p.max() / 2\n", " p = np.exp(log_p)\n", " # normalize the probability vector\n", " pnorm = p.sum()\n", " pnorm = pnorm if pnorm>0 else 1\n", " return p / pnorm " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4,)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_document_counts.shape" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4,)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_sizes.shape" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4, 11314)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_word_counts.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%timeit\n", "sampling_distribution(text_vector_indices[0], alpha, beta, topic_document_counts, topic_sizes, topic_word_counts)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "num_topics = 100\n", "V = text_vector_indices.max()\n", "K = num_topics\n", "D = len(text_vector_indices)\n" ] }, { "cell_type": "code", "execution_count": 215, "metadata": {}, "outputs": [], "source": [ "alpha = 0.1\n", "beta = 0.1" ] }, { "cell_type": "code", "execution_count": 216, "metadata": {}, "outputs": [], "source": [ "topic_document_counts, topic_sizes, topic_word_counts, document_topics = make_topics(text_vector_indices, num_topics)" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "43 141\n", "45 138\n", "64 133\n", "2 133\n", "23 130\n", " ... \n", "59 94\n", "71 91\n", "26 90\n", "14 89\n", "36 87\n", "Length: 100, dtype: int64" ] }, "execution_count": 217, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(document_topics).value_counts()" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [], "source": [ "@numba.jit(nopython=True)\n", "def update_topic(doc, topic, topic_sizes, topic_document_counts, topic_word_counts, update_int):\n", " topic_sizes[topic] += update_int\n", " topic_document_counts[topic] += update_int\n", " for w in doc:\n", " if w < 0:\n", " break\n", " topic_word_counts[topic][w] += update_int\n", "\n", "\n", "def step(text_vector_indices, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts, document_topics):\n", " D, maxsize = text_vector_indices.shape\n", " K, V = topic_word_counts.shape\n", " for d in range(D):\n", " doc = text_vector_indices[d]\n", " # update old\n", " previous_cluster = document_topics[d]\n", " update_topic(doc, previous_cluster, topic_sizes, topic_document_counts, topic_word_counts, -1)\n", " # sample\n", " p = sampling_distribution(doc, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts)\n", " new_cluster = np.argmax(np.random.multinomial(1, p))\n", " document_topics[d] = new_cluster\n", " # update new\n", " update_topic(doc, previous_cluster, topic_sizes, topic_document_counts, topic_word_counts, 1)" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 11.7 s, sys: 63.7 ms, total: 11.7 s\n", "Wall time: 11.7 s\n" ] } ], "source": [ "%%time\n", "text_vector_indices = get_text_vector_indices(vectorized_texts)\n", "topic_document_counts, topic_sizes, topic_word_counts, document_topics = make_topics(text_vector_indices, num_topics)\n", "step(text_vector_indices, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts, document_topics)" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 93., 100., 128., 116., 117., 108., 115., 103., 106., 105., 119.,\n", " 133., 121., 120., 119., 111., 121., 91., 124., 125., 118., 103.,\n", " 117., 129., 119., 121., 112., 109., 119., 115., 123., 96., 134.,\n", " 121., 118., 121., 125., 107., 118., 106., 122., 126., 104., 104.,\n", " 115., 115., 103., 117., 101., 124., 115., 103., 93., 110., 115.,\n", " 113., 100., 125., 106., 102., 121., 102., 97., 118., 123., 104.,\n", " 99., 100., 119., 113., 113., 119., 105., 126., 124., 116., 111.,\n", " 117., 151., 126., 112., 130., 110., 120., 96., 124., 107., 104.,\n", " 104., 99., 111., 89., 106., 124., 108., 104., 121., 110., 111.,\n", " 101.])" ] }, "execution_count": 221, "metadata": {}, "output_type": "execute_result" } ], "source": [ "topic_document_counts" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "import gsdmm" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "mgp = gsdmm.MovieGroupProcess(K=8, alpha=0.1, beta=0.1, n_iters=1)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "In stage 0: transferred 9905 clusters with 8 clusters populated\n", "CPU times: user 12min 53s, sys: 54.3 ms, total: 12min 53s\n", "Wall time: 12min 53s\n" ] } ], "source": [ "%%time\n", "y = mgp.fit(texts, vocab_size=V)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 100x speedup! " ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "126.625" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(16 * 60 + 53) / 8" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "ml", "language": "python", "name": "ml" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.1" } }, "nbformat": 4, "nbformat_minor": 2 }