{ "metadata": { "name": "", "signature": "sha256:0850bc3815eb59c20b84d98d83310aec6f8103361657c859105eff8d45f49845" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# Corpus loader, timing helper and numerics used by the cells below.\n", "from sklearn.datasets import fetch_20newsgroups\n", "from pprint import pprint\n", "\n", "import sys\n", "from time import time\n", "\n", "import numpy as np\n", "\n", "# Select the inline plotting backend only. %pylab additionally star-imports\n", "# numpy and pylab, overwriting notebook variables that share their names.\n", "%matplotlib inline" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "WARNING: pylab import has clobbered these variables: ['clf', 'e']\n", "`%matplotlib` prevents importing * from pylab and numpy\n" ] } ], "prompt_number": 91 }, { "cell_type": "code", "collapsed": false, "input": [ "# Download (or load from the local cache) the training split of the corpus.\n", "dataset = fetch_20newsgroups(subset='train')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "# Corpus size. print(\"\") yields a blank line on Python 2 and 3 alike,\n", "# whereas print() on Python 2 prints the empty tuple \"()\".\n", "print(\"%d documents\" % len(dataset.data))\n", "print(\"%d categories\" % len(dataset.target_names))\n", "print(\"\")" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "11314 documents\n", "20 categories\n", "()\n" ] } ], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "# Ground-truth newsgroup id of each document, and the number of distinct ids.\n", "labels = dataset.target\n", "true_k = np.unique(labels).shape[0]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# TF-IDF weighting; skip English stop words, terms in more than half of the\n", "# documents (max_df=0.5) and terms in fewer than two documents (min_df=2).\n", "vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "# Fit the vocabulary and build the sparse document-term matrix, timing the step.\n", "t0 = time()\n", "X = vectorizer.fit_transform(dataset.data)\n", "\n", "print(\"done in %fs\" % 
(time() - t0))\n", "print(\"n_samples: %d, n_features: %d\" % X.shape)\n", "print(\"\")  # blank line; print() would show \"()\" on Python 2" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "done in 3.360153s\n", "n_samples: 11314, n_features: 56122\n", "()\n" ] } ], "prompt_number": 52 }, { "cell_type": "code", "collapsed": false, "input": [ "X.shape" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 67, "text": [ "(11314, 56122)" ] } ], "prompt_number": 67 }, { "cell_type": "code", "collapsed": false, "input": [ "X" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 71, "text": [ "<11314x56122 sparse matrix of type ''\n", "\twith 1180487 stored elements in Compressed Sparse Row format>" ] } ], "prompt_number": 71 }, { "cell_type": "code", "collapsed": false, "input": [ "import scipy.sparse" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 72 }, { "cell_type": "code", "collapsed": false, "input": [ "# Sanity check: the vectorizer output is a SciPy sparse matrix ...\n", "scipy.sparse.issparse(X)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 75, "text": [ "True" ] } ], "prompt_number": 75 }, { "cell_type": "code", "collapsed": false, "input": [ "# ... and specifically one in CSR layout.\n", "scipy.sparse.isspmatrix_csr(X)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 78, "text": [ "True" ] } ], "prompt_number": 78 }, { "cell_type": "code", "collapsed": false, "input": [ "# Transpose to (n_features, n_samples) through the matrix's own method\n", "# instead of calling csr_matrix.transpose as an unbound function.\n", "XT = X.transpose()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 83 }, { "cell_type": "code", "collapsed": false, "input": [ "XT.shape" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 85, "text": [ "(56122, 11314)" ] } ], "prompt_number": 85 }, { "cell_type": "code", "collapsed": false, "input": [ "# Feature-by-feature Gram matrix X^T X. .dot() is a matrix product for every\n", "# sparse type, whereas the meaning of * depends on the container class.\n", "A = XT.dot(X)" ], 
"language": "python", "metadata": {}, "outputs": [], "prompt_number": 87 }, { "cell_type": "code", "collapsed": false, "input": [ "A.shape" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 88, "text": [ "(56122, 56122)" ] } ], "prompt_number": 88 }, { "cell_type": "code", "collapsed": false, "input": [ "# eigsh lives in the scipy.sparse.linalg submodule; import it explicitly\n", "# instead of relying on another package having loaded it as a side effect.\n", "import scipy.sparse.linalg\n", "\n", "# k=10 eigenpairs of the symmetric matrix A, returned as a tuple\n", "# (eigenvalues, eigenvectors).\n", "e = scipy.sparse.linalg.eigsh(A, k=10)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 113 }, { "cell_type": "code", "collapsed": false, "input": [ "e" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "A.shape" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 123, "text": [ "(56122, 56122)" ] } ], "prompt_number": 123 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.preprocessing import Normalizer\n", "from sklearn import metrics\n", "\n", "from sklearn.cluster import KMeans, MiniBatchKMeans" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 124 }, { "cell_type": "code", "collapsed": false, "input": [ "# One cluster per newsgroup (true_k) rather than a hard-coded 20, and a fixed\n", "# seed so that the k-means++ initialisation is repeatable across runs.\n", "km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n", "            verbose=True, random_state=0)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 125 }, { "cell_type": "code", "collapsed": false, "input": [ "km.fit(X)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Initialization complete\n", "Iteration 0, inertia 21125.031" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 1, inertia 11015.579" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 2, inertia 10962.996" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 3, inertia 10937.959" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 4, inertia 10924.570" ] }, { 
"output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 5, inertia 10917.009" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 6, inertia 10910.716" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 7, inertia 10903.537" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 8, inertia 10897.802" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 9, inertia 10896.361" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 10, inertia 10895.680" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 11, inertia 10895.077" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 12, inertia 10894.385" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 13, inertia 10893.487" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 14, inertia 10892.448" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 15, inertia 10891.038" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 16, inertia 10890.493" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 17, inertia 10890.295" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 18, inertia 10890.204" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 19, inertia 10890.135" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 20, inertia 10890.029" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 21, inertia 10889.974" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 22, inertia 10889.936" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Iteration 23, inertia 10889.916" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", 
"Converged at iteration 23\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 126, "text": [ "KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=20, n_init=1,\n", " n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,\n", " verbose=True)" ] } ], "prompt_number": 126 }, { "cell_type": "code", "collapsed": false, "input": [ "# External scores: compare the k-means assignment with the true newsgroups.\n", "print(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\n", "print(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\n", "print(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\n", "print(\"Adjusted Rand-Index: %.3f\"\n", "      % metrics.adjusted_rand_score(labels, km.labels_))\n", "# The silhouette is an internal score of the clustering itself, so it must be\n", "# given the fitted assignment (km.labels_), not the ground-truth labels.\n", "# It is estimated on a random sample of 1000 documents.\n", "print(\"Silhouette Coefficient: %0.3f\"\n", "      % metrics.silhouette_score(X, km.labels_, sample_size=1000))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Homogeneity: 0.365\n", "Completeness: 0.508\n", "V-measure: 0.425\n", "Adjusted Rand-Index: 0.085\n", "Silhouette Coefficient: 0.006" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 127 }, { "cell_type": "code", "collapsed": false, "input": [ "print(\"Top terms per cluster:\")\n", "# For each centroid, feature indices ordered from largest to smallest weight\n", "# (argsort is ascending, so the columns are reversed).\n", "order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n", "# Vocabulary terms, indexable by feature column.\n", "terms = vectorizer.get_feature_names()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Top terms per cluster:\n" ] } ], "prompt_number": 134 }, { "cell_type": "code", "collapsed": false, "input": [ "terms" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 135, "text": [ "[u'00',\n", " u'000',\n", " u'0000',\n", " u'00000',\n", " u'00000000',\n", " u'0000000004',\n", " u'0000000005',\n", " u'0000001200',\n", " u'000005102000',\n", " u'000021',\n", " u'0000vec',\n", " u'0001',\n", " u'000152',\n", " u'0002',\n", " u'000359',\n", " u'00041032',\n", " 
u'000413',\n", " u'0004422',\n", " u'0005',\n", " u'0005111312',\n", " u'0005111312na1em',\n", " u'000531',\n", " u'00072',\n", " u'0009',\n", " u'000k',\n", " u'000th',\n", " u'001',\n", " u'0010',\n", " u'00100111b',\n", " u'0010580b',\n", " u'0011',\n", " u'001116',\n", " u'001127',\n", " u'001230',\n", " u'0013',\n", " u'001319',\n", " u'001321',\n", " u'001428',\n", " u'001555',\n", " u'001642',\n", " u'001707',\n", " u'001718',\n", " u'001813',\n", " u'002',\n", " u'0020',\n", " u'002118',\n", " u'0022',\n", " u'002222',\n", " u'002251w',\n", " u'002302',\n", " u'0028',\n", " u'0029',\n", " u'002937',\n", " u'003',\n", " u'003015',\n", " u'003258u19250',\n", " u'0033',\n", " u'003522',\n", " u'003719',\n", " u'003749',\n", " u'004',\n", " u'0044',\n", " u'004418',\n", " u'0049',\n", " u'005',\n", " u'005131',\n", " u'005245',\n", " u'005314',\n", " u'0059',\n", " u'0060',\n", " u'0062',\n", " u'0065',\n", " u'0068',\n", " u'007',\n", " u'0076',\n", " u'0078',\n", " u'008',\n", " u'0086',\n", " u'009',\n", " u'0096b0f0',\n", " u'0096b11b',\n", " u'0096b294',\n", " u'0098',\n", " u'00am',\n", " u'00bjgood',\n", " u'00ecgillespi',\n", " u'00ecgillespie',\n", " u'00mbstultz',\n", " u'00pm',\n", " u'00r',\n", " u'01',\n", " u'010',\n", " u'0100',\n", " u'01000000b',\n", " u'01001111b',\n", " u'01002',\n", " u'01003',\n", " u'010235',\n", " u'010329',\n", " u'010734',\n", " u'010821',\n", " u'0109',\n", " u'011',\n", " u'011042',\n", " u'0111',\n", " u'0112',\n", " u'0114',\n", " u'011720',\n", " u'011805',\n", " u'011823',\n", " u'011855',\n", " u'012',\n", " u'0123456789',\n", " u'0126',\n", " u'013',\n", " u'013037',\n", " u'013651',\n", " u'0138',\n", " u'013939',\n", " u'014',\n", " u'01420',\n", " u'014237',\n", " u'015',\n", " u'015415',\n", " u'015442',\n", " u'0158',\n", " u'01580',\n", " u'016',\n", " u'01609',\n", " u'0164',\n", " u'01701',\n", " u'01720',\n", " u'01730',\n", " u'01752',\n", " u'01803',\n", " u'01821',\n", " u'0183',\n", " u'0184',\n", " 
u'01854',\n", " u'018b',\n", " u'01a',\n", " u'01wb',\n", " u'02',\n", " u'020',\n", " u'0200',\n", " u'020347',\n", " u'020356',\n", " u'020359',\n", " u'0205',\n", " u'020504',\n", " u'0209',\n", " u'021021',\n", " u'02106',\n", " u'02115',\n", " u'02118',\n", " u'02138',\n", " u'02139',\n", " u'02142',\n", " u'02154',\n", " u'02172',\n", " u'02173',\n", " u'022',\n", " u'022113',\n", " u'02215',\n", " u'022218',\n", " u'022222',\n", " u'0223',\n", " u'02238',\n", " u'0226',\n", " u'022922',\n", " u'022926',\n", " u'023017',\n", " u'023044',\n", " u'0235',\n", " u'023730',\n", " u'023b',\n", " u'024',\n", " u'024036',\n", " u'024103',\n", " u'024222',\n", " u'024246',\n", " u'024423',\n", " u'0245',\n", " u'02451203',\n", " u'025',\n", " u'025027',\n", " u'025240',\n", " u'025426',\n", " u'025818u28037',\n", " u'025924',\n", " u'02678944',\n", " u'02903',\n", " u'02917',\n", " u'02p',\n", " u'02tl',\n", " u'02tm_',\n", " u'02uv',\n", " u'03',\n", " u'030',\n", " u'0300',\n", " u'030031',\n", " u'0303',\n", " u'030412',\n", " u'030636',\n", " u'031',\n", " u'031349',\n", " u'031423',\n", " u'031616',\n", " u'0318',\n", " u'031823',\n", " u'032017',\n", " u'032022',\n", " u'032251',\n", " u'032345',\n", " u'032623',\n", " u'032828',\n", " u'033',\n", " u'0330',\n", " u'033446',\n", " u'034',\n", " u'034101',\n", " u'034226',\n", " u'0346',\n", " u'0349',\n", " u'0350',\n", " u'035020',\n", " u'035406',\n", " u'0357',\n", " u'0358',\n", " u'036',\n", " u'0362',\n", " u'0366',\n", " u'037',\n", " u'0372',\n", " u'038',\n", " u'0382',\n", " u'0384',\n", " u'0391',\n", " u'03hz',\n", " u'03i',\n", " u'03k',\n", " u'04',\n", " u'040',\n", " u'0400',\n", " u'040254',\n", " u'040449',\n", " u'041',\n", " u'0410',\n", " u'04110',\n", " u'041343',\n", " u'041741',\n", " u'042',\n", " u'0423',\n", " u'043',\n", " u'043426',\n", " u'043654',\n", " u'0437',\n", " u'043935',\n", " u'044140',\n", " u'044323',\n", " u'0444',\n", " u'044749',\n", " u'045',\n", " u'0454',\n", " 
u'045526',\n", " u'045612',\n", " u'045651',\n", " u'045u',\n", " u'047',\n", " u'0493',\n", " u'04g',\n", " u'04p',\n", " u'04q',\n", " u'05',\n", " u'050',\n", " u'0500',\n", " u'050046mvs104',\n", " u'050127',\n", " u'050311',\n", " u'050451',\n", " u'050550',\n", " u'051',\n", " u'0510',\n", " u'0511',\n", " u'051746',\n", " u'051942',\n", " u'052',\n", " u'052005',\n", " u'052120rap115',\n", " u'053',\n", " u'0530',\n", " u'0533',\n", " u'053333',\n", " u'053553',\n", " u'053736',\n", " u'053748rap115',\n", " u'053905',\n", " u'054820',\n", " u'055',\n", " u'055100',\n", " u'055109',\n", " u'055341',\n", " u'056',\n", " u'0565',\n", " u'0578',\n", " u'0582',\n", " u'059',\n", " u'0593',\n", " u'0598',\n", " u'05apr93',\n", " u'05l',\n", " u'06',\n", " u'060',\n", " u'0600',\n", " u'060010',\n", " u'060493101758',\n", " u'060493114752',\n", " u'060540',\n", " u'060553',\n", " u'06066',\n", " u'0608',\n", " u'061',\n", " u'06108',\n", " u'06111',\n", " u'061326',\n", " u'0615',\n", " u'062055',\n", " u'062219',\n", " u'062907',\n", " u'06320',\n", " u'063425',\n", " u'0635',\n", " u'064',\n", " u'064028',\n", " u'064804',\n", " u'0649',\n", " u'065',\n", " u'06520',\n", " u'0663',\n", " u'0666',\n", " u'0667',\n", " u'067',\n", " u'0674',\n", " u'06840',\n", " u'0688',\n", " u'069',\n", " u'06eh',\n", " u'06p',\n", " u'06paul',\n", " u'06w',\n", " u'07',\n", " u'070',\n", " u'0700',\n", " u'0705',\n", " u'07059',\n", " u'0709',\n", " u'071',\n", " u'071791',\n", " u'071823',\n", " u'0721',\n", " u'0729',\n", " u'073',\n", " u'073051',\n", " u'073457ripbc',\n", " u'0735',\n", " u'0739',\n", " u'074',\n", " u'074054',\n", " u'0747',\n", " u'074836',\n", " u'075',\n", " u'0751',\n", " u'075822',\n", " u'077',\n", " u'07748',\n", " u'0777',\n", " u'079',\n", " u'0792',\n", " u'0793',\n", " u'0795',\n", " u'08',\n", " u'0801',\n", " u'08057',\n", " u'081',\n", " u'081052',\n", " u'0812',\n", " u'0815',\n", " u'0820',\n", " u'0821',\n", " u'082152',\n", " u'0823',\n", 
" u'082430',\n", " u'0824e',\n", " u'082502acps6992',\n", " u'0826',\n", " u'083',\n", " u'083057',\n", " u'0832',\n", " u'083324',\n", " u'084',\n", " u'084042',\n", " u'0845',\n", " u'085',\n", " u'0850',\n", " u'08502',\n", " u'08520',\n", " u'085337',\n", " u'08540',\n", " u'085638',\n", " u'085848',\n", " u'086',\n", " u'0863',\n", " u'0866',\n", " u'087',\n", " u'088',\n", " u'0883',\n", " u'08836',\n", " u'089',\n", " u'0891',\n", " u'0895',\n", " u'08a283a0',\n", " u'08h',\n", " u'09',\n", " u'0900',\n", " u'090030',\n", " u'090731',\n", " u'0908',\n", " u'091',\n", " u'091051',\n", " u'091139',\n", " u'091202',\n", " u'091258',\n", " u'091844',\n", " u'092101',\n", " u'0922',\n", " u'092246dlmqc',\n", " u'092830',\n", " u'092954',\n", " u'0930',\n", " u'093227',\n", " u'093231',\n", " u'093300',\n", " u'093527',\n", " u'093914',\n", " u'094',\n", " u'094509',\n", " u'095',\n", " u'0950',\n", " u'0952',\n", " u'095220',\n", " u'0953',\n", " u'0962',\n", " u'0965',\n", " u'0969',\n", " u'0987',\n", " u'0988',\n", " u'099',\n", " u'0996',\n", " u'0_',\n", " u'0___',\n", " u'0a',\n", " u'0b',\n", " u'0b10',\n", " u'0b14',\n", " u'0b15',\n", " u'0b16',\n", " u'0bn',\n", " u'0c',\n", " u'0cg',\n", " u'0d',\n", " u'0d2',\n", " u'0db',\n", " u'0df',\n", " u'0e',\n", " u'0e1',\n", " u'0e9',\n", " u'0ek',\n", " u'0ep',\n", " u'0f',\n", " u'0fh',\n", " u'0fk',\n", " u'0g',\n", " u'0g4',\n", " u'0g8',\n", " u'0ggv',\n", " u'0gn',\n", " u'0h',\n", " u'0h2',\n", " u'0h9',\n", " u'0ha',\n", " u'0hd',\n", " u'0ht',\n", " u'0i',\n", " u'0ic',\n", " u'0iv',\n", " u'0ivbtm9',\n", " u'0ivbud',\n", " u'0ivbud9',\n", " u'0ivbudk',\n", " u'0ivbvl',\n", " u'0ivf2l',\n", " u'0j',\n", " u'0k',\n", " u'0k5',\n", " u'0kd',\n", " u'0kj',\n", " u'0km',\n", " u'0l',\n", " u'0m',\n", " u'0m75u',\n", " u'0m8b',\n", " u'0m8bnh',\n", " u'0ma',\n", " u'0max',\n", " u'0mk',\n", " u'0mk80',\n", " u'0mvbdi',\n", " u'0mvbgt',\n", " u'0mvmk',\n", " u'0n',\n", " u'0n1',\n", " u'0o',\n", " 
u'0p',\n", " u'0pd',\n", " u'0q',\n", " u'0qax',\n", " u'0qq',\n", " u'0qu',\n", " u'0qvq',\n", " u'0r',\n", " u'0r_',\n", " u'0ra6abh107h',\n", " u'0rdf',\n", " u'0rhj',\n", " u'0s',\n", " u'0sl',\n", " u'0t',\n", " u'0t7',\n", " u'0tb',\n", " u'0tbxn',\n", " u'0tbxom',\n", " u'0tg',\n", " u'0th',\n", " u'0tq',\n", " u'0tq33',\n", " u'0tq6',\n", " u'0u',\n", " u'0u1',\n", " u'0u14',\n", " u'0v',\n", " u'0va',\n", " u'0w',\n", " u'0w5',\n", " u'0w5r',\n", " u'0wa',\n", " u'0wc',\n", " u'0we',\n", " u'0wk',\n", " u'0x',\n", " u'0x0',\n", " u'0x00',\n", " u'0x01',\n", " u'0x03',\n", " u'0x0f',\n", " u'0x10',\n", " u'0x100',\n", " u'0x20',\n", " u'0x21',\n", " u'0x2e0',\n", " u'0x30',\n", " u'0x37f',\n", " u'0x3c',\n", " u'0x3f',\n", " u'0xff',\n", " u'0y',\n", " u'0z',\n", " u'10',\n", " u'100',\n", " u'1000',\n", " u'10000',\n", " u'100000',\n", " u'100015',\n", " u'100024',\n", " u'10009',\n", " u'1000cc',\n", " u'1000r',\n", " u'1000s',\n", " u'1000w',\n", " u'1000yds',\n", " u'1001',\n", " u'10012',\n", " u'10016',\n", " u'10018',\n", " u'1002',\n", " u'10022',\n", " u'1003',\n", " u'10036',\n", " u'100387',\n", " u'100444',\n", " u'100452',\n", " u'1005',\n", " u'1006',\n", " u'1007',\n", " u'1008',\n", " u'1009',\n", " u'100921rk0vsanu',\n", " u'10098',\n", " u'100dpi',\n", " u'100hz',\n", " u'100k',\n", " u'100km',\n", " u'100lez',\n", " u'100m',\n", " u'100ma',\n", " u'100mb',\n", " u'100mhz',\n", " u'100mph',\n", " u'100ns',\n", " u'100s',\n", " u'101',\n", " u'1010',\n", " u'10101',\n", " u'101010',\n", " u'101044',\n", " u'1011',\n", " u'10115',\n", " u'10118',\n", " u'1012',\n", " u'101241',\n", " u'1013',\n", " u'101323',\n", " u'1014',\n", " u'1015',\n", " u'1016',\n", " u'101636',\n", " u'1017',\n", " u'10176',\n", " u'10179',\n", " u'1018',\n", " u'1019',\n", " u'101944',\n", " u'101957',\n", " u'101e',\n", " u'102',\n", " u'1020',\n", " u'102007',\n", " u'10206',\n", " u'1021',\n", " u'1023',\n", " u'1024',\n", " u'1024x758',\n", " u'1024x768',\n", " 
u'1024x768x16',\n", " u'1024x768x256',\n", " u'1024x768x65536',\n", " u'1024x768x70',\n", " u'1024x786x24',\n", " u'1025',\n", " u'10250',\n", " u'1026',\n", " u'1027',\n", " u'10273',\n", " u'10274',\n", " u'102756',\n", " u'1028',\n", " u'1029',\n", " u'102nd',\n", " u'103',\n", " u'1030',\n", " u'103038',\n", " u'1031',\n", " u'1032',\n", " u'103237',\n", " u'10326',\n", " u'1033',\n", " u'1034',\n", " u'1035',\n", " u'1036',\n", " u'1037',\n", " u'1038',\n", " u'1039',\n", " u'103rd',\n", " u'104',\n", " u'1040',\n", " u'104158',\n", " u'1042',\n", " u'1043',\n", " u'10438',\n", " u'1045',\n", " u'1046',\n", " u'1047',\n", " u'104746',\n", " u'1048',\n", " u'1049',\n", " u'105',\n", " u'1050',\n", " u'1051',\n", " u'10510',\n", " u'1053',\n", " u'105307',\n", " u'1054',\n", " u'1055',\n", " u'1056',\n", " u'1057',\n", " u'105738',\n", " u'1058',\n", " u'10580',\n", " u'105809',\n", " u'1059',\n", " u'105m',\n", " u'105mb',\n", " u'106',\n", " u'1060',\n", " u'10601',\n", " u'1061',\n", " u'10615',\n", " u'1062',\n", " u'1063',\n", " u'1064',\n", " u'1065',\n", " u'1066',\n", " u'10669',\n", " u'1067',\n", " u'1068',\n", " u'1069',\n", " u'10694',\n", " u'106ps',\n", " u'107',\n", " u'1070',\n", " u'1071',\n", " u'10711',\n", " u'1072',\n", " u'1073',\n", " u'1074',\n", " u'10748539',\n", " u'1075',\n", " u'1076',\n", " u'1077',\n", " u'1078',\n", " u'1079',\n", " u'10792',\n", " u'108',\n", " u'1080',\n", " u'1081',\n", " u'1082',\n", " u'1083',\n", " u'1084',\n", " u'10847',\n", " u'1085',\n", " u'1088',\n", " u'10886',\n", " u'1089',\n", " u'10890',\n", " u'109',\n", " u'1090',\n", " u'10901',\n", " u'1091',\n", " u'1093',\n", " u'1094',\n", " u'10946',\n", " u'1095',\n", " u'1096',\n", " u'1097',\n", " u'1098',\n", " u'1099',\n", " u'10_',\n", " u'10a',\n", " u'10base',\n", " u'10baset',\n", " u'10c',\n", " u'10cm',\n", " u'10e20',\n", " u'10h',\n", " u'10k',\n", " u'10km',\n", " u'10m',\n", " u'10mb',\n", " u'10mhz',\n", " u'10mil',\n", " u'10min',\n", " 
u'10mm',\n", " u'10pm',\n", " u'10s',\n", " u'10th',\n", " u'10v',\n", " u'10w',\n", " u'10w40',\n", " u'10x',\n", " u'10x20',\n", " u'11',\n", " u'110',\n", " u'1100',\n", " u'110021',\n", " u'11004',\n", " u'1101',\n", " u'1102',\n", " u'1105',\n", " u'11074',\n", " u'1108',\n", " u'1109',\n", " u'110m',\n", " u'110v',\n", " u'110vac',\n", " u'111',\n", " u'1111',\n", " u'11111101b',\n", " u'1113',\n", " u'1114',\n", " u'1115',\n", " u'11150',\n", " u'111652',\n", " u'111713',\n", " u'1118',\n", " u'11181',\n", " u'1119',\n", " u'111919',\n", " u'112',\n", " u'1120',\n", " u'1121',\n", " u'1122',\n", " u'11229',\n", " u'11230',\n", " u'1124',\n", " u'1126',\n", " u'1127',\n", " u'1128',\n", " u'1129',\n", " u'11292',\n", " u'112f',\n", " u'113',\n", " u'11311',\n", " u'113128',\n", " u'1132',\n", " u'113223',\n", " u'1133',\n", " u'11331',\n", " u'1134',\n", " u'1135',\n", " u'11353',\n", " u'11361',\n", " u'11363',\n", " u'1137',\n", " u'1138',\n", " u'113956',\n", " u'113p',\n", " u'113q',\n", " u'113qs',\n", " u'113qw',\n", " u'113s',\n", " u'113s1',\n", " u'114',\n", " u'1140',\n", " u'1141',\n", " u'114127',\n", " u'11414',\n", " u'114158',\n", " u'1142',\n", " u'1143',\n", " u'1144',\n", " u'11448',\n", " u'1145',\n", " u'1145w1',\n", " u'1146',\n", " u'11467',\n", " u'1147',\n", " u'11471',\n", " u'11473',\n", " u'1147902781',\n", " u'1149',\n", " u'115',\n", " u'1150',\n", " u'115080',\n", " u'1151',\n", " u'11522',\n", " u'115288',\n", " u'115290',\n", " u'1152x900',\n", " u'115300',\n", " u'115313',\n", " u'115397',\n", " u'1154',\n", " u'115437',\n", " u'11546',\n", " u'11548',\n", " u'1155',\n", " u'115511',\n", " u'115565',\n", " u'1156',\n", " u'1157',\n", " u'115707',\n", " u'11573',\n", " u'115863',\n", " u'1159',\n", " u'115a',\n", " u'115vac',\n", " u'116',\n", " u'1160',\n", " u'116005',\n", " u'1161',\n", " u'11613',\n", " u'1163',\n", " u'116305',\n", " u'11632',\n", " u'1164',\n", " u'1165',\n", " u'11670',\n", " u'1169',\n", " u'11690',\n", 
" u'117',\n", " u'1170',\n", " u'11732',\n", " u'1174',\n", " u'1175',\n", " u'11751',\n", " u'1176',\n", " u'11769k',\n", " u'1177',\n", " u'11770',\n", " u'11782',\n", " u'1179',\n", " u'11797',\n", " u'118',\n", " u'1180',\n", " u'11800',\n", " u'11812',\n", " u'11825',\n", " u'11830',\n", " u'11836',\n", " u'1185',\n", " u'1186',\n", " u'11861',\n", " u'11888',\n", " u'119',\n", " u'1190',\n", " u'1192',\n", " u'1192d',\n", " u'1194',\n", " u'1196',\n", " u'11971',\n", " u'1198',\n", " u'11a',\n", " u'11b',\n", " u'11h',\n", " u'11k',\n", " u'11oq',\n", " u'11pm',\n", " u'11sdpa',\n", " u'11th',\n", " u'12',\n", " u'120',\n", " u'1200',\n", " u'12009',\n", " u'1200cc',\n", " u'1200dpi',\n", " u'1200x',\n", " u'1201',\n", " u'1203',\n", " u'120311',\n", " u'120399',\n", " u'1204',\n", " u'120466',\n", " u'1205',\n", " u'12050',\n", " u'12056',\n", " u'1206',\n", " u'120666',\n", " u'1207',\n", " u'12073',\n", " u'1208',\n", " u'1209',\n", " u'12091',\n", " u'12092',\n", " u'120958',\n", " u'120k',\n", " u'120km',\n", " u'120kvolt',\n", " u'120lb',\n", " u'120m',\n", " u'120mb',\n", " u'120mph',\n", " u'120v',\n", " u'120vac',\n", " u'121',\n", " u'1210',\n", " u'121019',\n", " u'1211',\n", " u'121134',\n", " u'1212',\n", " u'121236',\n", " u'1213',\n", " u'12134',\n", " u'12139',\n", " u'1214',\n", " u'121411',\n", " u'1215',\n", " u'12176',\n", " u'12180',\n", " u'12187',\n", " u'122',\n", " u'1220',\n", " ...]" ] } ], "prompt_number": 135 }, { "cell_type": "code", "collapsed": false, "input": [ "# Ten highest-weighted terms of every fitted centroid. The loop bound comes\n", "# from the model itself, and print is called as a function so the cell runs\n", "# unchanged on Python 2 and Python 3.\n", "for i in range(km.n_clusters):\n", "    print(\"Cluster %d:\" % i)\n", "    for ind in order_centroids[i, :10]:\n", "        print(terms[ind])\n", "    print(\"\")  # blank separator line between clusters" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Cluster 0:\n", "turkish\n", "armenian\n", "armenians\n", "armenia\n", "turks\n", "argic\n", "turkey\n", "serdar\n", "genocide\n", "zuma\n", "\n", "Cluster 1:\n", "com\n", "sandvik\n", "stratus\n", "cramer\n", "optilink\n", 
"fbi\n", "kent\n", "people\n", "article\n", "islam\n", "\n", "Cluster 2:\n", "drive\n", "scsi\n", "ide\n", "controller\n", "drives\n", "hard\n", "disk\n", "floppy\n", "bus\n", "mac\n", "\n", "Cluster 3:\n", "baseball\n", "year\n", "game\n", "team\n", "games\n", "players\n", "runs\n", "braves\n", "pitching\n", "article\n", "\n", "Cluster 4:\n", "intercon\n", "amanda\n", "walker\n", "clipper\n", "com\n", "corporation\n", "systems\n", "herndon\n", "crypto\n", "chaos\n", "\n", "Cluster 5:\n", "nasa\n", "space\n", "gov\n", "henry\n", "alaska\n", "toronto\n", "moon\n", "zoo\n", "larc\n", "spencer\n", "\n", "Cluster 6:\n", "geb\n", "banks\n", "gordon\n", "pitt\n", "cs\n", "dsl\n", "n3jxp\n", "cadre\n", "chastity\n", "shameful\n", "\n", "Cluster 7:\n", "hockey\n", "team\n", "ca\n", "game\n", "nhl\n", "play\n", "players\n", "season\n", "leafs\n", "toronto\n", "\n", "Cluster 8:\n", "uk\n", "ac\n", "university\n", "cam\n", "dcs\n", "ed\n", "mathew\n", "posting\n", "host\n", "nntp\n", "\n", "Cluster 9:\n", "card\n", "video\n", "drivers\n", "vga\n", "monitor\n", "cards\n", "bus\n", "windows\n", "driver\n", "diamond\n", "\n", "Cluster 10:\n", "key\n", "clipper\n", "encryption\n", "chip\n", "keys\n", "escrow\n", "government\n", "com\n", "nsa\n", "algorithm\n", "\n", "Cluster 11:\n", "__\n", "___\n", "berkeley\n", "ax\n", "simms\n", "_____\n", "____\n", "baalke\n", "vram\n", "jpl\n", "\n", "Cluster 12:\n", "israel\n", "israeli\n", "jews\n", "arab\n", "jake\n", "arabs\n", "lebanese\n", "israelis\n", "adam\n", "policy\n", "\n", "Cluster 13:\n", "keith\n", "caltech\n", "livesey\n", "sgi\n", "solntze\n", "wpd\n", "jon\n", "schneider\n", "cco\n", "morality\n", "\n", "Cluster 14:\n", "car\n", "cars\n", "com\n", "engine\n", "oil\n", "dealer\n", "radar\n", "article\n", "just\n", "good\n", "\n", "Cluster 15:\n", "access\n", "digex\n", "pat\n", "express\n", "net\n", "online\n", "prb\n", "communications\n", "com\n", "greenbelt\n", "\n", "Cluster 16:\n", "windows\n", "window\n", "dos\n", 
"file\n", "files\n", "ms\n", "program\n", "com\n", "mouse\n", "use\n", "\n", "Cluster 17:\n", "god\n", "jesus\n", "bible\n", "christians\n", "people\n", "christ\n", "christian\n", "faith\n", "believe\n", "church\n", "\n", "Cluster 18:\n", "gun\n", "guns\n", "people\n", "firearms\n", "com\n", "weapons\n", "don\n", "crime\n", "control\n", "militia\n", "\n", "Cluster 19:\n", "com\n", "university\n", "posting\n", "host\n", "article\n", "nntp\n", "know\n", "like\n", "ca\n", "just\n", "\n" ] } ], "prompt_number": 143 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }