In [1]:
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

In [3]:
categories = [
 'alt.atheism',
 'talk.religion.misc',
 'comp.graphics',
 'sci.space',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

dataset = fetch_20newsgroups(subset='all', categories=categories,
 shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
 min_df=2, stop_words='english',
 use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
 print("Performing dimensionality reduction using LSA")
 t0 = time()
 # Vectorizer results are normalized, which makes KMeans behave as
 # spherical k-means for better results. Since LSA/SVD results are
 # not normalized, we have to redo the normalization.
 svd = TruncatedSVD(opts.n_components)
 lsa = make_pipeline(svd, Normalizer(copy=False))

 X = lsa.fit_transform(X)

 print("done in %fs" % (time() - t0))

 explained_variance = svd.explained_variance_ratio_.sum()
 print("Explained variance of the SVD step: {}%".format(
 int(explained_variance * 100)))

 print()


###############################################################################
# Do the actual clustering

if opts.minibatch:
 km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
 init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
 km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
 verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
 % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
 % metrics.silhouette_score(X, labels, sample_size=1000))

print()

if not (opts.n_components or opts.use_hashing):
 print("Top terms per cluster:")
 order_centroids = km.cluster_centers_.argsort()[:, ::-1]
 terms = vectorizer.get_feature_names()
 for i in range(true_k):
 print("Cluster %d:" % i, end='')
 for ind in order_centroids[i, :10]:
 print(' %s' % terms[ind], end='')
 print()

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
3387 documents
4 categories

Extracting features from the training dataset using a sparse vectorizer


NameError: name 'opts' is not defined