{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Biclustering documents with the Spectral Co-clustering algorithm\n\nThis example demonstrates the Spectral Co-clustering algorithm on the\ntwenty newsgroups dataset. The 'comp.os.ms-windows.misc' category is\nexcluded because it contains many posts consisting of nothing but data.\n\nThe TF-IDF vectorized posts form a word frequency matrix, which is\nthen biclustered using Dhillon's Spectral Co-Clustering algorithm. The\nresulting document-word biclusters indicate subsets of words that are\nused more often in those subsets of documents.\n\nFor each of a few of the best biclusters, its most common document\ncategories and its ten most important words are printed. The best\nbiclusters are determined by their normalized cut. The best words are\ndetermined by comparing their sums inside and outside the bicluster.\n\nFor comparison, the documents are also clustered using\nMiniBatchKMeans. The document clusters derived from the biclusters\nachieve a better V-measure than clusters found by MiniBatchKMeans.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\nfrom collections import Counter\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster import MiniBatchKMeans, SpectralCoclustering\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\n\ndef number_normalizer(tokens):\n    \"\"\"Map all numeric tokens to a placeholder.\n\n    For many applications, tokens that begin with a number are not directly\n    useful, but the fact that such a token exists can be relevant. By applying\n    this form of dimensionality reduction, some methods may perform better.\n    \"\"\"\n    return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n    def build_tokenizer(self):\n        tokenize = super().build_tokenizer()\n        return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = [\n    \"alt.atheism\",\n    \"comp.graphics\",\n    \"comp.sys.ibm.pc.hardware\",\n    \"comp.sys.mac.hardware\",\n    \"comp.windows.x\",\n    \"misc.forsale\",\n    \"rec.autos\",\n    \"rec.motorcycles\",\n    \"rec.sport.baseball\",\n    \"rec.sport.hockey\",\n    \"sci.crypt\",\n    \"sci.electronics\",\n    \"sci.med\",\n    \"sci.space\",\n    \"soc.religion.christian\",\n    \"talk.politics.guns\",\n    \"talk.politics.mideast\",\n    \"talk.politics.misc\",\n    \"talk.religion.misc\",\n]\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words=\"english\", min_df=5)\ncocluster = SpectralCoclustering(\n    n_clusters=len(categories), svd_method=\"arpack\", random_state=0\n)\nkmeans = MiniBatchKMeans(\n    n_clusters=len(categories), batch_size=20000, random_state=0, n_init=3\n)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\n    f\"Done in {time() - start_time:.2f}s. V-measure: \\\n{v_measure_score(y_cocluster, y_true):.4f}\"\n)\n\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\n    f\"Done in {time() - start_time:.2f}s. V-measure: \\\n{v_measure_score(y_kmeans, y_true):.4f}\"\n)\n\n\nfeature_names = vectorizer.get_feature_names_out()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n    # Ratio of the weight leaving bicluster i (the cut) to the weight\n    # inside it; lower values indicate better biclusters.\n    rows, cols = cocluster.get_indices(i)\n    if not (np.any(rows) and np.any(cols)):\n        import sys\n\n        return sys.float_info.max\n    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n    # Note: the following is identical to X[rows[:, np.newaxis],\n    # cols].sum() but much faster in scipy <= 0.16\n    weight = X[rows][:, cols].sum()\n    cut = X[row_complement][:, cols].sum() + X[rows][:, col_complement].sum()\n    return cut / weight\n\n\nbicluster_ncuts = list(bicluster_ncut(i) for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n    n_rows, n_cols = cocluster.get_shape(cluster)\n    cluster_docs, cluster_words = cocluster.get_indices(cluster)\n    if not len(cluster_docs) or not len(cluster_words):\n        continue\n\n    # categories\n    counter = Counter(document_names[doc] for doc in cluster_docs)\n\n    cat_string = \", \".join(\n        f\"{(c / n_rows * 100):.0f}% {name}\" for name, c in counter.most_common(3)\n    )\n\n    # words\n    out_of_cluster_docs = cocluster.row_labels_ != cluster\n    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n    word_col = X[:, cluster_words]\n    word_scores = np.array(\n        word_col[cluster_docs, :].sum(axis=0)\n        - word_col[out_of_cluster_docs, :].sum(axis=0)\n    )\n    word_scores = word_scores.ravel()\n    important_words = list(\n        feature_names[cluster_words[i]] for i in word_scores.argsort()[:-11:-1]\n    )\n\n    print(f\"bicluster {idx} : {n_rows} documents, {n_cols} words\")\n    print(f\"categories : {cat_string}\")\n    print(f\"words : {', '.join(important_words)}\\n\")" ] }
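, { "cell_type": "markdown", "metadata": {}, "source": [ "\nAs a supplementary illustration of the co-clustering estimator itself, the next cell is a minimal sketch on synthetic data rather than on the newsgroups matrix: it plants a few biclusters with `make_biclusters`, fits `SpectralCoclustering`, and checks how well they are recovered with `consensus_score`. The matrix shape, noise level and number of clusters are arbitrary choices made only for this illustration.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Minimal sanity check on synthetic data: plant biclusters, shuffle the\n# matrix, and verify that SpectralCoclustering recovers them.\nfrom sklearn.cluster import SpectralCoclustering\nfrom sklearn.datasets import make_biclusters\nfrom sklearn.metrics import consensus_score\n\n# shuffle=True permutes rows and columns; the returned indicator arrays\n# already refer to the shuffled matrix.\ndata, row_ind, col_ind = make_biclusters(\n    shape=(300, 300), n_clusters=5, noise=5, shuffle=True, random_state=0\n)\n\nmodel = SpectralCoclustering(n_clusters=5, random_state=0)\nmodel.fit(data)\n\n# consensus_score is 1.0 when the planted biclusters are recovered exactly.\nscore = consensus_score(model.biclusters_, (row_ind, col_ind))\nprint(f\"consensus score: {score:.3f}\")" ] }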
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 0 }