\n",
"scipy.sparse._csr.csr_matrix\n",
"def __init__(arg1, shape=None, dtype=None, copy=False)\n",
"/usr/local/lib/python3.10/dist-packages/scipy/sparse/_csr.py\n",
"\n",
"Compressed Sparse Row matrix\n",
"\n",
"This can be instantiated in several ways:\n",
" csr_array(D)\n",
" with a dense matrix or rank-2 ndarray D\n",
"\n",
" csr_array(S)\n",
" with another sparse matrix S (equivalent to S.tocsr())\n",
"\n",
" csr_array((M, N), [dtype])\n",
" to construct an empty matrix with shape (M, N)\n",
" dtype is optional, defaulting to dtype='d'.\n",
"\n",
" csr_array((data, (row_ind, col_ind)), [shape=(M, N)])\n",
" where ``data``, ``row_ind`` and ``col_ind`` satisfy the\n",
" relationship ``a[row_ind[k], col_ind[k]] = data[k]``.\n",
"\n",
" csr_array((data, indices, indptr), [shape=(M, N)])\n",
" is the standard CSR representation where the column indices for\n",
" row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their\n",
" corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.\n",
" If the shape parameter is not supplied, the matrix dimensions\n",
" are inferred from the index arrays.\n",
"\n",
"Attributes\n",
"----------\n",
"dtype : dtype\n",
" Data type of the matrix\n",
"shape : 2-tuple\n",
" Shape of the matrix\n",
"ndim : int\n",
" Number of dimensions (this is always 2)\n",
"nnz\n",
" Number of stored values, including explicit zeros\n",
"data\n",
" CSR format data array of the matrix\n",
"indices\n",
" CSR format index array of the matrix\n",
"indptr\n",
" CSR format index pointer array of the matrix\n",
"has_sorted_indices\n",
" Whether indices are sorted\n",
"\n",
"Notes\n",
"-----\n",
"\n",
"Sparse matrices can be used in arithmetic operations: they support\n",
"addition, subtraction, multiplication, division, and matrix power.\n",
"\n",
"Advantages of the CSR format\n",
" - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.\n",
" - efficient row slicing\n",
" - fast matrix vector products\n",
"\n",
"Disadvantages of the CSR format\n",
" - slow column slicing operations (consider CSC)\n",
" - changes to the sparsity structure are expensive (consider LIL or DOK)\n",
"\n",
"Canonical Format\n",
" - Within each row, indices are sorted by column.\n",
" - There are no duplicate entries.\n",
"\n",
"Examples\n",
"--------\n",
"\n",
">>> import numpy as np\n",
">>> from scipy.sparse import csr_array\n",
">>> csr_array((3, 4), dtype=np.int8).toarray()\n",
"array([[0, 0, 0, 0],\n",
" [0, 0, 0, 0],\n",
" [0, 0, 0, 0]], dtype=int8)\n",
"\n",
">>> row = np.array([0, 0, 1, 2, 2, 2])\n",
">>> col = np.array([0, 2, 2, 0, 1, 2])\n",
">>> data = np.array([1, 2, 3, 4, 5, 6])\n",
">>> csr_array((data, (row, col)), shape=(3, 3)).toarray()\n",
"array([[1, 0, 2],\n",
" [0, 0, 3],\n",
" [4, 5, 6]])\n",
"\n",
">>> indptr = np.array([0, 2, 3, 6])\n",
">>> indices = np.array([0, 2, 2, 0, 1, 2])\n",
">>> data = np.array([1, 2, 3, 4, 5, 6])\n",
">>> csr_array((data, indices, indptr), shape=(3, 3)).toarray()\n",
"array([[1, 0, 2],\n",
" [0, 0, 3],\n",
" [4, 5, 6]])\n",
"\n",
"Duplicate entries are summed together:\n",
"\n",
">>> row = np.array([0, 1, 2, 0])\n",
">>> col = np.array([0, 1, 1, 0])\n",
">>> data = np.array([1, 2, 4, 8])\n",
">>> csr_array((data, (row, col)), shape=(3, 3)).toarray()\n",
"array([[9, 0, 0],\n",
" [0, 2, 0],\n",
" [0, 4, 0]])\n",
"\n",
"As an example of how to construct a CSR matrix incrementally,\n",
"the following snippet builds a term-document matrix from texts:\n",
"\n",
">>> docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]\n",
">>> indptr = [0]\n",
">>> indices = []\n",
">>> data = []\n",
">>> vocabulary = {}\n",
">>> for d in docs:\n",
"... for term in d:\n",
"... index = vocabulary.setdefault(term, len(vocabulary))\n",
"... indices.append(index)\n",
"... data.append(1)\n",
"... indptr.append(len(indices))\n",
"...\n",
">>> csr_array((data, indices, indptr), dtype=int).toarray()\n",
"array([[2, 1, 0, 0],\n",
" [0, 1, 1, 1]])
\n"
]
},
"metadata": {},
"execution_count": 39
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(2000, 1000)"
]
},
"metadata": {},
"execution_count": 39
},
{
"output_type": "stream",
"name": "stdout",
"text": [
" (0, 708)\t0.12621877625178227\n",
" (0, 410)\t0.11650651629173196\n",
" (0, 493)\t0.1631127602376565\n",
" (0, 548)\t0.11873384536901997\n",
" (0, 130)\t0.13595955391213657\n",
" (0, 567)\t0.13595955391213657\n",
" (0, 412)\t0.12831668397369733\n",
" (0, 750)\t0.15376128408643466\n",
" (0, 841)\t0.18564440175793037\n",
" (0, 206)\t0.15810189392327795\n",
" (0, 764)\t0.1640284908630232\n",
" (0, 748)\t0.13595955391213657\n",
" (0, 904)\t0.08983671288492111\n",
" (0, 923)\t0.11966934266418663\n",
" (0, 527)\t0.1690393571774018\n",
" (0, 432)\t0.13369075280946802\n",
" (0, 988)\t0.12740095334833063\n",
" (0, 488)\t0.3750048191807266\n",
" (0, 717)\t0.17767638066823058\n",
" (0, 587)\t0.6454209423982519\n",
" (0, 862)\t0.1551447391479567\n",
" (0, 286)\t0.11115911128919416\n",
" (0, 867)\t0.15810189392327795\n",
" (0, 881)\t0.11227372176926384\n",
" (1, 381)\t0.20157910011124136\n",
" :\t:\n",
" (1998, 504)\t0.04875543232365812\n",
" (1998, 991)\t0.053978162418983656\n",
" (1998, 566)\t0.03637572081429063\n",
" (1998, 611)\t0.05504978412016225\n",
" (1998, 171)\t0.047384737904817335\n",
" (1998, 414)\t0.08876861152823663\n",
" (1998, 268)\t0.23575826480007847\n",
" (1998, 491)\t0.1114848475886964\n",
" (1998, 271)\t0.05622767285588837\n",
" (1998, 907)\t0.06818500433590943\n",
" (1998, 710)\t0.05998220148907317\n",
" (1998, 998)\t0.04605022195294345\n",
" (1998, 173)\t0.10248793661244614\n",
" (1998, 122)\t0.05810140044184461\n",
" (1998, 984)\t0.0397488592737133\n",
" (1998, 533)\t0.05951387738098097\n",
" (1998, 306)\t0.030847223209189208\n",
" (1998, 540)\t0.12852849537452227\n",
" (1998, 130)\t0.04971790762820881\n",
" (1998, 750)\t0.05622767285588837\n",
" (1998, 286)\t0.04064884201283483\n",
" (1999, 738)\t0.5707845186348437\n",
" (1999, 366)\t0.56500361648845\n",
" (1999, 356)\t0.44578463121221495\n",
" (1999, 286)\t0.3952872489933768\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Use tf (raw term count) features for LDA.\n",
"print(\"Extracting tf features for LDA...\")\n",
"tf_vectorizer = CountVectorizer(\n",
" max_df=0.95, min_df=2, max_features=n_features, stop_words=\"english\"\n",
")\n",
"t0 = time()\n",
"tf = tf_vectorizer.fit_transform(data_samples)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"print()\n",
"\n",
"# Fit the NMF model\n",
"print(\n",
" \"Fitting the NMF model (Frobenius norm) with tf-idf features, \"\n",
" \"n_samples=%d and n_features=%d...\" % (n_samples, n_features)\n",
")\n",
"t0 = time()\n",
"nmf = NMF(\n",
" n_components=n_components,\n",
" random_state=1,\n",
" init=init,\n",
" beta_loss=\"frobenius\",\n",
" alpha_W=0.00005,\n",
" alpha_H=0.00005,\n",
" l1_ratio=1,\n",
").fit(tfidf)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"\n",
"\n",
"tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
"plot_top_words(\n",
" nmf, tfidf_feature_names, n_top_words, \"Topics in NMF model (Frobenius norm)\"\n",
")\n",
"\n",
"# Fit the NMF model\n",
"print(\n",
" \"\\n\" * 2,\n",
" \"Fitting the NMF model (generalized Kullback-Leibler \"\n",
" \"divergence) with tf-idf features, n_samples=%d and n_features=%d...\"\n",
" % (n_samples, n_features),\n",
")\n",
"t0 = time()\n",
"nmf = NMF(\n",
" n_components=n_components,\n",
" random_state=1,\n",
" init=init,\n",
" beta_loss=\"kullback-leibler\",\n",
" solver=\"mu\",\n",
" max_iter=1000,\n",
" alpha_W=0.00005,\n",
" alpha_H=0.00005,\n",
" l1_ratio=0.5,\n",
").fit(tfidf)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"\n",
"tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
"plot_top_words(\n",
" nmf,\n",
" tfidf_feature_names,\n",
" n_top_words,\n",
" \"Topics in NMF model (generalized Kullback-Leibler divergence)\",\n",
")\n",
"\n",
"# Fit the MiniBatchNMF model\n",
"print(\n",
" \"\\n\" * 2,\n",
" \"Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf \"\n",
" \"features, n_samples=%d and n_features=%d, batch_size=%d...\"\n",
" % (n_samples, n_features, batch_size),\n",
")\n",
"t0 = time()\n",
"mbnmf = MiniBatchNMF(\n",
" n_components=n_components,\n",
" random_state=1,\n",
" batch_size=batch_size,\n",
" init=init,\n",
" beta_loss=\"frobenius\",\n",
" alpha_W=0.00005,\n",
" alpha_H=0.00005,\n",
" l1_ratio=0.5,\n",
").fit(tfidf)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"\n",
"\n",
"tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
"plot_top_words(\n",
" mbnmf,\n",
" tfidf_feature_names,\n",
" n_top_words,\n",
" \"Topics in MiniBatchNMF model (Frobenius norm)\",\n",
")\n",
"\n",
"# Fit the MiniBatchNMF model\n",
"print(\n",
" \"\\n\" * 2,\n",
" \"Fitting the MiniBatchNMF model (generalized Kullback-Leibler \"\n",
" \"divergence) with tf-idf features, n_samples=%d and n_features=%d, \"\n",
" \"batch_size=%d...\" % (n_samples, n_features, batch_size),\n",
")\n",
"t0 = time()\n",
"mbnmf = MiniBatchNMF(\n",
" n_components=n_components,\n",
" random_state=1,\n",
" batch_size=batch_size,\n",
" init=init,\n",
" beta_loss=\"kullback-leibler\",\n",
" alpha_W=0.00005,\n",
" alpha_H=0.00005,\n",
" l1_ratio=0.5,\n",
").fit(tfidf)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"\n",
"tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()\n",
"plot_top_words(\n",
" mbnmf,\n",
" tfidf_feature_names,\n",
" n_top_words,\n",
" \"Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)\",\n",
")\n",
"\n",
"print(\n",
" \"\\n\" * 2,\n",
" \"Fitting LDA models with tf features, n_samples=%d and n_features=%d...\"\n",
" % (n_samples, n_features),\n",
")\n",
"lda = LatentDirichletAllocation(\n",
" n_components=n_components,\n",
" max_iter=5,\n",
" learning_method=\"online\",\n",
" learning_offset=50.0,\n",
" random_state=0,\n",
")\n",
"t0 = time()\n",
"lda.fit(tf)\n",
"print(\"done in %0.3fs.\" % (time() - t0))\n",
"\n",
"tf_feature_names = tf_vectorizer.get_feature_names_out()\n",
"plot_top_words(lda, tf_feature_names, n_top_words, \"Topics in LDA model\")"
],
"metadata": {
"id": "y5kCqfaZPJlH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "rH7Sxc727Dsa"
},
"source": [
"# Source: https://github.com/ekochmar/Getting-Started-with-NLP/blob/master/Chapter10.ipynb Chapter 10: LDA for Topic Modeling"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zf5yraXr7Dsd"
},
"source": [
"## Load Newsgroups data\n",
"\n",
"As before, let's consider a specific set of categories:\n",
"https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset\n",
"\n",
"The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics, split into two subsets: one for training (or development) and the other for testing (or for performance evaluation). The split between the train and test sets is based upon messages posted before and after a specific date.\n",
"\n",
"This module contains two loaders. The first one, sklearn.datasets.fetch_20newsgroups, returns a list of the raw texts that can be fed to text feature extractors such as CountVectorizer with custom parameters so as to extract feature vectors. The second one, sklearn.datasets.fetch_20newsgroups_vectorized, returns ready-to-use features, i.e., it is not necessary to use a feature extractor.\n"
]
},
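{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cells below use the first loader, `fetch_20newsgroups`, and build the features ourselves. A minimal sketch (not executed here) contrasting the two loaders, using only standard scikit-learn arguments:\n",
"\n",
"```python\n",
"from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized\n",
"\n",
"# Loader 1: raw texts -- vectorize them yourself (e.g. with CountVectorizer)\n",
"raw = fetch_20newsgroups(subset=\"train\", remove=(\"headers\", \"footers\", \"quotes\"))\n",
"print(type(raw.data[0]))   # <class 'str'>\n",
"\n",
"# Loader 2: ready-to-use features -- a sparse token-count matrix\n",
"vec = fetch_20newsgroups_vectorized(subset=\"train\")\n",
"print(vec.data.shape)      # (n_documents, n_features)\n",
"```"
]
},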
{
"cell_type": "code",
"source": [
"from IPython.core.interactiveshell import InteractiveShell\n",
"InteractiveShell.ast_node_interactivity = \"all\""
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uwopWBE79MmQ",
"outputId": "35c61f45-61b9-4e31-f0da-5af78dc2397b"
},
"execution_count": null,
"outputs": [
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_p6sGJzp7Dse",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4a232a54-f058-4f01-d123-c61eef350b07"
},
"outputs": [
],
"source": [
"from sklearn.datasets import fetch_20newsgroups\n",
"\n",
"def load_dataset(sset, cats):\n",
" if cats==[]:\n",
" newsgroups_dset = fetch_20newsgroups(subset=sset,\n",
" remove=('headers', 'footers', 'quotes'),\n",
" shuffle=True)\n",
" else:\n",
" newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,\n",
" remove=('headers', 'footers', 'quotes'),\n",
" shuffle=True)\n",
" return newsgroups_dset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "TXB2ImNP7Dsf",
"outputId": "d73b19a1-768f-4d25-8b96-c0ef5e004035",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"9850\n"
]
}
],
"source": [
"categories = [\"comp.windows.x\", \"misc.forsale\", \"rec.autos\", \"rec.motorcycles\", \"rec.sport.baseball\"]\n",
"categories += [\"rec.sport.hockey\", \"sci.crypt\", \"sci.med\", \"sci.space\", \"talk.politics.mideast\"]\n",
"\n",
"newsgroups_all = load_dataset('all', categories)\n",
"print(len(newsgroups_all.data))"
]
},
{
"cell_type": "code",
"source": [
"newsgroups_all.keys()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qigi1mT893mf",
"outputId": "6c4c2d41-297b-4f6f-ebbd-75e028f77e4e"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])"
]
},
"metadata": {},
"execution_count": 44
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "boMhBphQ7Dsg"
},
"source": [
"## Preprocess\n",
"\n",
"Convert word forms to stems to get concise representations for the documents:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "EysSi8EE7Dsg",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "f35807f2-07a2-4a02-e0a5-70dde8a78699"
},
"outputs": [
],
"source": [
"import nltk\n",
"from nltk.stem import SnowballStemmer\n",
"\n",
"stemmer = SnowballStemmer(\"english\")\n",
"\n",
"def stem(text):\n",
" return stemmer.stem(text)"
]
},
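{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check of the `stem()` helper defined above, a few word-to-stem mappings (these match the preprocessed output shown further down, e.g. 'available' -> 'avail'):\n",
"\n",
"```python\n",
"print([stem(w) for w in [\"available\", \"mouse\", \"welcome\", \"cursor\"]])\n",
"# expected: ['avail', 'mous', 'welcom', 'cursor']\n",
"```"
]
},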
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "E60KV4C97Dsh",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ddfc7825-4e74-4083-f4a3-a626e01aa192"
},
"outputs": [
],
"source": [
"import gensim\n",
"from gensim.utils import simple_preprocess\n",
"from gensim.parsing.preprocessing import STOPWORDS as stopwords\n",
"\n",
"#print(stopwords)\n",
"\n",
"def preprocess(text):\n",
" result = []\n",
" for token in gensim.utils.simple_preprocess(text, min_len=4):\n",
" if token not in stopwords: #and len(token) > 3:\n",
" result.append(stem(token))\n",
" return result"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aak2xZFK7Dsh"
},
"source": [
"Check how each document is represented. For example, look into the very first one:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "orjYOM_p7Dsh",
"outputId": "c8081d70-aaf6-432a-977f-7bf816c245c7",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Original document: \n",
"Hi Xperts!\n",
"\n",
"How can I move the cursor with the keyboard (i.e. cursor keys), \n",
"if no mouse is available?\n",
"\n",
"Any hints welcome.\n",
"\n",
"Thanks.\n",
"\n",
"\n",
"Tokenized document: \n",
"['Hi', 'Xperts', 'How', 'can', 'I', 'move', 'the', 'cursor', 'with', 'the', 'keyboard', 'i', 'e', 'cursor', 'keys', 'if', 'no', 'mouse', 'is', 'available', 'Any', 'hints', 'welcome', 'Thanks']\n",
"\n",
"\n",
"Preprocessed document: \n",
"['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint', 'welcom', 'thank']\n"
]
}
],
"source": [
"doc_sample = newsgroups_all.data[0]\n",
"print('Original document: ')\n",
"print(doc_sample)\n",
"\n",
"print('\\n\\nTokenized document: ')\n",
"words = []\n",
"for token in gensim.utils.tokenize(doc_sample):\n",
" words.append(token)\n",
"print(words)\n",
"\n",
"print('\\n\\nPreprocessed document: ')\n",
"print(preprocess(doc_sample))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NiCIijFm7Dsi"
},
"source": [
"What do the first 10 documents look like?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "osvVgf0v7Dsi",
"outputId": "80fb67f0-759c-4117-fe9c-6ce3a12cbab4",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0\txpert, cursor, keyboard, cursor, key, mous, avail, hint, welcom, thank\n",
"1\tobtain, copi, open, look, widget, obtain, need, order, copi, thank\n",
"2\tright, signal, strong, live, west, philadelphia, perfect, sport, fan, dream\n",
"3\tcanadian, thing, coach, boston, bruin, colorado, rocki, summari, post, gather\n",
"4\theck, feel, like, time, includ, cafeteria, work, half, time, headach\n",
"5\tdamn, right, late, climb, meet, morn, bother, right, foot, asleep\n",
"6\tolympus, stylus, pocket, camera, smallest, class, includ, time, date, stamp\n",
"7\tinclud, follow, chmos, clock, generat, driver, processor, chmos, eras, prom\n",
"8\tchang, intel, discov, xclient, xload, longer, work, bomb, messag, error\n",
"9\ttermin, like, power, server, run, window, manag, special, client, program\n"
]
}
],
"source": [
"for i in range(0, 10):\n",
" print(str(i) + \"\\t\" + \", \".join(preprocess(newsgroups_all.data[i])[:10]))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SEczJc0-7Dsi"
},
"source": [
"Now let's represent each document as a dictionary of relevant words. Each word (*value* in the dictionary) has a unique identifier (*key*):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-l0QbJo67Dsi",
"outputId": "3f9811f2-2e57-4e32-f788-12e4092fc323",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"9850\n",
"39350\n",
"0 avail\n",
"1 cursor\n",
"2 hint\n",
"3 key\n",
"4 keyboard\n",
"5 mous\n",
"6 thank\n",
"7 welcom\n",
"8 xpert\n",
"9 copi\n"
]
}
],
"source": [
"processed_docs = []\n",
"for i in range(0, len(newsgroups_all.data)):\n",
" processed_docs.append(preprocess(newsgroups_all.data[i]))\n",
"\n",
"print(len(processed_docs))\n",
"\n",
"dictionary = gensim.corpora.Dictionary(processed_docs)\n",
"print(len(dictionary))\n",
"\n",
"index = 0\n",
"for key, value in dictionary.iteritems():\n",
" print(key, value)\n",
" index += 1\n",
" if index > 9:\n",
" break"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R4x81BlW7Dsj"
},
"source": [
"Put some constraints on the dictionary of terms: for instance, keep up to $100,000$ words (`keep_n`) that occur more frequently than $10$ times (`no_below`) and less frequently than in $50\\%$ of the documents (`no_above`). This should help you extract the most useful terms, while still keeping a reasonable number of them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Wu7j2fAV7Dsj",
"outputId": "fc5f931f-fe6d-48a1-e474-12c26e1be593",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"5868\n"
]
}
],
"source": [
"dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)\n",
"print(len(dictionary))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "a2DiDMqp7Dsj"
},
"source": [
"Let's see how a particular document is represented in this dictionary: for example, look into the very first post, or into the 100th:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "90r1HUsV7Dsj",
"outputId": "3f0b1504-9ba0-49b9-b743-7d0ebc59bec3",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]"
]
},
"metadata": {},
"execution_count": 51
}
],
"source": [
"bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n",
"bow_corpus[0]\n",
"#bow_corpus[99]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "pGHKdL997Dsj"
},
"source": [
"Let's decode what each index (key) in this dictionary points to:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Gqsy9Ywn7Dsj",
"outputId": "790da678-b0c2-4d92-fe6e-bc8435aa5bed",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Key 0 =\"avail\": occurrences=1\n",
"Key 1 =\"cursor\": occurrences=2\n",
"Key 2 =\"hint\": occurrences=1\n",
"Key 3 =\"key\": occurrences=1\n",
"Key 4 =\"keyboard\": occurrences=1\n",
"Key 5 =\"mous\": occurrences=1\n",
"Key 6 =\"thank\": occurrences=1\n",
"Key 7 =\"welcom\": occurrences=1\n",
"Key 8 =\"xpert\": occurrences=1\n"
]
}
],
"source": [
"#bow_doc = bow_corpus[99]\n",
"bow_doc = bow_corpus[0]\n",
"\n",
"for i in range(len(bow_doc)):\n",
" print(f\"Key {bow_doc[i][0]} =\\\"{dictionary[bow_doc[i][0]]}\\\":\\\n",
" occurrences={bow_doc[i][1]}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CWcrenAD7Dsk"
},
"source": [
"## Train an LDA model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "D6K7YVND7Dsk",
"outputId": "d5f850f9-db9d-4a1e-b469-43a51587a73c",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Topic: 0 \n",
"Words: 0.021*\"encrypt\" + 0.018*\"secur\" + 0.018*\"chip\" + 0.016*\"govern\" + 0.013*\"clipper\" + 0.012*\"public\" + 0.010*\"privaci\" + 0.010*\"key\" + 0.010*\"phone\" + 0.009*\"algorithm\"\n",
"Topic: 1 \n",
"Words: 0.017*\"appear\" + 0.014*\"copi\" + 0.013*\"cover\" + 0.013*\"star\" + 0.013*\"book\" + 0.011*\"penalti\" + 0.010*\"black\" + 0.009*\"comic\" + 0.008*\"blue\" + 0.008*\"green\"\n",
"Topic: 2 \n",
"Words: 0.031*\"window\" + 0.015*\"server\" + 0.012*\"program\" + 0.012*\"file\" + 0.012*\"applic\" + 0.012*\"display\" + 0.011*\"widget\" + 0.010*\"version\" + 0.010*\"motif\" + 0.010*\"support\"\n",
"Topic: 3 \n",
"Words: 0.015*\"space\" + 0.007*\"launch\" + 0.007*\"year\" + 0.007*\"medic\" + 0.006*\"patient\" + 0.006*\"orbit\" + 0.006*\"research\" + 0.006*\"diseas\" + 0.005*\"develop\" + 0.005*\"nasa\"\n",
"Topic: 4 \n",
"Words: 0.018*\"armenian\" + 0.011*\"peopl\" + 0.008*\"kill\" + 0.008*\"said\" + 0.007*\"turkish\" + 0.006*\"muslim\" + 0.006*\"jew\" + 0.006*\"govern\" + 0.005*\"state\" + 0.005*\"greek\"\n",
"Topic: 5 \n",
"Words: 0.024*\"price\" + 0.021*\"sale\" + 0.020*\"offer\" + 0.017*\"drive\" + 0.017*\"sell\" + 0.016*\"includ\" + 0.013*\"ship\" + 0.013*\"interest\" + 0.011*\"ask\" + 0.010*\"condit\"\n",
"Topic: 6 \n",
"Words: 0.018*\"mail\" + 0.016*\"list\" + 0.015*\"file\" + 0.015*\"inform\" + 0.013*\"send\" + 0.012*\"post\" + 0.012*\"avail\" + 0.010*\"request\" + 0.010*\"program\" + 0.009*\"includ\"\n",
"Topic: 7 \n",
"Words: 0.019*\"like\" + 0.016*\"know\" + 0.011*\"time\" + 0.011*\"look\" + 0.010*\"think\" + 0.008*\"want\" + 0.008*\"thing\" + 0.008*\"good\" + 0.007*\"go\" + 0.007*\"bike\"\n",
"Topic: 8 \n",
"Words: 0.033*\"game\" + 0.022*\"team\" + 0.017*\"play\" + 0.015*\"year\" + 0.013*\"player\" + 0.011*\"season\" + 0.008*\"hockey\" + 0.008*\"score\" + 0.007*\"leagu\" + 0.007*\"goal\"\n",
"Topic: 9 \n",
"Words: 0.013*\"peopl\" + 0.012*\"think\" + 0.011*\"like\" + 0.009*\"time\" + 0.009*\"right\" + 0.009*\"israel\" + 0.009*\"know\" + 0.006*\"reason\" + 0.006*\"point\" + 0.006*\"thing\"\n"
]
}
],
"source": [
"# Create the dictionary\n",
"id2word = dictionary\n",
"\n",
"# Create the corpus with word frequencies\n",
"corpus = bow_corpus\n",
"\n",
"# Build the LDA model\n",
"lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,\n",
" id2word=id2word,\n",
" num_topics=10,\n",
" random_state=100,\n",
" update_every=1,\n",
" chunksize=1000,\n",
" passes=10,\n",
" alpha='symmetric',\n",
" iterations=100,\n",
" per_word_topics=True)\n",
"\n",
"\n",
"for index, topic in lda_model.print_topics(-1):\n",
" print(f\"Topic: {index} \\nWords: {topic}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HBheSrFS7Dsk"
},
"source": [
"## Interpret the results\n",
"\n",
"What is the most representative topic in each document?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "u18dKneK7Dsk",
"outputId": "99fd90e8-b298-446c-af95-851100114ed4",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" ID Main Topic Contribution (%) Keywords Snippet \n",
" 0 2 0.8268 window, server, program, file, applic\n",
" ['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint'] \n",
" 1 6 0.4741 mail, list, file, inform, send\n",
" ['obtain', 'copi', 'open', 'look', 'widget', 'obtain', 'need', 'order'] \n",
" 2 7 0.4230 like, know, time, look, think\n",
" ['right', 'signal', 'strong', 'live', 'west', 'philadelphia', 'perfect', 'sport'] \n",
" 3 8 0.4159 game, team, play, year, player\n",
" ['canadian', 'thing', 'coach', 'boston', 'bruin', 'colorado', 'rocki', 'summari'] \n",
" 4 9 0.9039 peopl, think, like, time, right\n",
" ['heck', 'feel', 'like', 'time', 'includ', 'cafeteria', 'work', 'half'] \n",
" 5 7 0.6291 like, know, time, look, think\n",
" ['damn', 'right', 'late', 'climb', 'meet', 'morn', 'bother', 'right'] \n",
" 6 3 0.3485 space, launch, year, medic, patient\n",
" ['olympus', 'stylus', 'pocket', 'camera', 'smallest', 'class', 'includ', 'time'] \n",
" 7 5 0.3799 price, sale, offer, drive, sell\n",
" ['includ', 'follow', 'chmos', 'clock', 'generat', 'driver', 'processor', 'chmos'] \n",
" 8 2 0.7943 window, server, program, file, applic\n",
" ['chang', 'intel', 'discov', 'xclient', 'xload', 'longer', 'work', 'bomb'] \n",
" 9 2 0.6383 window, server, program, file, applic\n",
" ['termin', 'like', 'power', 'server', 'run', 'window', 'manag', 'special'] \n"
]
}
],
"source": [
"def analyse_topics(ldamodel, corpus, texts):\n",
" main_topic = {}\n",
" percentage = {}\n",
" keywords = {}\n",
" text_snippets = {}\n",
" # Get main topic in each document\n",
" for i, topic_list in enumerate(ldamodel[corpus]):\n",
" #print(\"\\n\")\n",
" #print(topic_list)\n",
" #print(\"\\n\")\n",
" #for i in range(0, len(topic_list)):\n",
" # print (topic_list[i])\n",
" topic = topic_list[0] if ldamodel.per_word_topics else topic_list\n",
" #print(topic)\n",
" topic = sorted(topic, key=lambda x: (x[1]), reverse=True)\n",
" # Get the main topic, contribution (%) and keywords for each document\n",
" for j, (topic_num, prop_topic) in enumerate(topic):\n",
" if j == 0: # => dominant topic\n",
" wp = ldamodel.show_topic(topic_num)\n",
" topic_keywords = \", \".join([word for word, prop in wp[:5]])\n",
" main_topic[i] = int(topic_num)\n",
" percentage[i] = round(prop_topic,4)\n",
" keywords[i] = topic_keywords\n",
" text_snippets[i] = texts[i][:8]\n",
" else:\n",
" break\n",
" return main_topic, percentage, keywords, text_snippets\n",
"\n",
"\n",
"main_topic, percentage, keywords, text_snippets = analyse_topics(\n",
" lda_model, bow_corpus, processed_docs)\n",
"\n",
"indexes = []\n",
"rows = []\n",
"for i in range(0, 10):\n",
" indexes.append(i)\n",
"rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])\n",
"\n",
"for idx in indexes:\n",
" rows.append([str(idx), f\"{main_topic.get(idx)}\",\n",
" f\"{percentage.get(idx):.4f}\",\n",
" f\"{keywords.get(idx)}\\n\",\n",
" f\"{text_snippets.get(idx)}\"])\n",
"\n",
"columns = zip(*rows)\n",
"column_widths = [max(len(item) for item in col) for col in columns]\n",
"for row in rows:\n",
" print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])\n",
" for i in range(0, len(row))))\n"
]
},
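{
"cell_type": "markdown",
"metadata": {},
"source": [
"The trained model can also score text that was not in the corpus: preprocess it, convert it to a bag-of-words with the same dictionary, and ask for its topic distribution. A minimal sketch (the example sentence is made up for illustration):\n",
"\n",
"```python\n",
"new_doc = \"The goalie made a great save in the third period of the hockey game\"\n",
"new_bow = dictionary.doc2bow(preprocess(new_doc))\n",
"\n",
"# topic distribution for the unseen document, most probable topic first\n",
"topics = sorted(lda_model.get_document_topics(new_bow), key=lambda x: -x[1])\n",
"print(topics[:3])\n",
"```"
]
},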
{
"cell_type": "markdown",
"metadata": {
"id": "VMc5ux2t7Dsk"
},
"source": [
"## Explore words and topics with pyLDAvis"
]
},
{
"cell_type": "code",
"source": [
"!pip install pyLDAvis"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EUp8QHbvAwHn",
"outputId": "ca8bf56a-ecc5-4809-f8ca-7d6717f60b77"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: pyLDAvis in /usr/local/lib/python3.10/dist-packages (3.4.1)\n",
"Requirement already satisfied: numpy>=1.24.2 in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (1.25.2)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (1.11.4)\n",
"Requirement already satisfied: pandas>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (2.0.3)\n",
"Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (1.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (3.1.4)\n",
"Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (2.10.0)\n",
"Requirement already satisfied: funcy in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (2.0)\n",
"Requirement already satisfied: scikit-learn>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (1.2.2)\n",
"Requirement already satisfied: gensim in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (4.3.2)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from pyLDAvis) (67.7.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=2.0.0->pyLDAvis) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=2.0.0->pyLDAvis) (2023.4)\n",
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=2.0.0->pyLDAvis) (2024.1)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0.0->pyLDAvis) (3.5.0)\n",
"Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim->pyLDAvis) (6.4.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->pyLDAvis) (2.1.5)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas>=2.0.0->pyLDAvis) (1.16.0)\n"
]
}
]
},
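{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the package installed, a gensim model can be handed to pyLDAvis for an interactive topic/term view. A minimal sketch (assuming pyLDAvis >= 3.0, where the gensim helper lives in `pyLDAvis.gensim_models`):\n",
"\n",
"```python\n",
"import pyLDAvis\n",
"import pyLDAvis.gensim_models\n",
"\n",
"pyLDAvis.enable_notebook()\n",
"vis = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)\n",
"vis  # renders the interactive visualisation in the notebook\n",
"```"
]
},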
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "c1KxmDhs7Dsk",
"outputId": "bef78cc5-142d-4479-8b77-aa6232fe12f8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 916
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"PreparedData(topic_coordinates= x y topics cluster Freq\n",
"topic \n",
"9 -0.071377 -0.158078 1 1 16.071927\n",
"7 0.021286 -0.150129 2 1 15.558590\n",
"4 0.043482 -0.124777 3 1 14.496566\n",
"3 -0.061378 0.005043 4 1 12.560903\n",
"8 0.209139 -0.183924 5 1 9.811112\n",
"6 -0.159857 0.164965 6 1 9.367518\n",
"2 -0.185643 0.098444 7 1 7.512238\n",
"0 -0.170320 -0.041934 8 1 6.917601\n",
"5 0.083449 0.192866 9 1 4.629165\n",
"1 0.291219 0.197524 10 1 3.074380, topic_info= Term Freq Total Category logprob loglift\n",
"123 game 2208.000000 2208.000000 Default 30.0000 30.0000\n",
"273 window 1629.000000 1629.000000 Default 29.0000 29.0000\n",
"598 armenian 1712.000000 1712.000000 Default 28.0000 28.0000\n",
"86 team 1451.000000 1451.000000 Default 27.0000 27.0000\n",
"254 mail 1397.000000 1397.000000 Default 26.0000 26.0000\n",
"... ... ... ... ... ... ...\n",
"311 forc 152.259346 683.518669 Topic10 -4.8725 1.9804\n",
"693 white 98.456731 321.460693 Topic10 -5.3085 2.2988\n",
"468 issu 108.238154 724.231507 Topic10 -5.2138 1.5813\n",
"130 left 100.041159 792.443258 Topic10 -5.2925 1.4125\n",
"1087 earth 91.108224 416.682392 Topic10 -5.3860 1.9618\n",
"\n",
"[648 rows x 6 columns], token_table= Topic Freq Term\n",
"term \n",
"1713 2 0.994027 accid\n",
"97 1 0.591889 actual\n",
"97 2 0.241280 actual\n",
"97 4 0.002513 actual\n",
"97 5 0.049010 actual\n",
"... ... ... ...\n",
"156 6 0.034544 year\n",
"156 8 0.002348 year\n",
"156 9 0.040916 year\n",
"2740 4 0.991196 yeast\n",
"5438 1 0.990100 zionist\n",
"\n",
"[1523 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[10, 8, 5, 4, 9, 7, 3, 1, 6, 2])"
],
"text/html": [
"\n",
"