# A Network Tour of Data Science
### &nbsp; &nbsp; &nbsp; Xavier Bresson, Winter 2016/17
## Exercise 4 - Code 2 : Unsupervised Learning
## Unsupervised Clustering with Kernel K-Means  

In [1]:
# Load libraries

# Math
import numpy as np

# Visualization 
%matplotlib notebook 
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import ndimage

# Print output of LFR code
import subprocess

# Sparse matrix
import scipy.sparse
import scipy.sparse.linalg

# 3D visualization
import pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot

# Import data
import scipy.io

# Import functions in lib folder
import sys
sys.path.insert(1, 'lib')

# Import helper functions
%load_ext autoreload
%autoreload 2
from lib.utils import construct_kernel
from lib.utils import compute_kernel_kmeans_EM
from lib.utils import compute_kernel_kmeans_spectral
from lib.utils import compute_purity

# Import distance function
import sklearn.metrics.pairwise

# Remove warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw MNIST images and their ground-truth labels from the .mat file
mat = scipy.io.loadmat('datasets/mnist_raw_data.mat')
X = mat['Xraw']
n, d = X.shape[0], X.shape[1]  # n = size of first axis, d = size of second axis

# Shift the stored labels down by one and drop singleton dimensions
# (presumably converts 1-based labels to 0-based -- confirm against the dataset)
Cgt = (mat['Cgt'] - 1).squeeze()
nc = np.unique(Cgt).size  # count of distinct label values

print('Number of data =', n)
print('Data dimensionality =', d)
print('Number of classes =', nc)

Number of data = 2000
Data dimensionality = 784
Number of classes = 10


**Question 1a:** What is the clustering accuracy of standard/linear K-Means?<br>
Hint: You may use the function *Ker=construct_kernel(X,'linear')* to compute the linear kernel, *[C_kmeans, En_kmeans]=compute_kernel_kmeans_EM(n_classes,Ker,Theta,10)* with *Theta=np.ones(n)* to run the standard K-Means algorithm, and *accuracy = compute_purity(C_computed,C_solution,n_clusters)* to compute the accuracy.

In [3]:
# Question 1a: standard K-Means, run as kernel K-Means (EM) on a linear kernel
Theta = np.ones(n)  # uniform weights: one entry equal to 1 per data point
Ker = construct_kernel(X, 'linear')

# The trailing 10 follows the exercise hint; its meaning is defined in lib/utils
C_kmeans, En_kmeans = compute_kernel_kmeans_EM(nc, Ker, Theta, 10)

# Score the computed clusters against the ground-truth labels
acc = compute_purity(C_kmeans, Cgt, nc)
print('accuracy standard kmeans=', acc)

Construct Linear Kernel
accuracy standard kmeans= 13.200000000000001


**Question 1b:** What is the clustering accuracy for the kernel K-Means algorithm with<br>
(1) Gaussian Kernel for the EM approach and the Spectral approach?<br>
(2) Polynomial Kernel for the EM approach and the Spectral approach?<br>
Hint: You may use the functions *Ker=construct_kernel(X,'gaussian')* and *Ker=construct_kernel(X,'polynomial',[1,0,2])* to compute the non-linear kernels.<br>
Hint: You may use the functions *C_kmeans,__ = compute_kernel_kmeans_EM(K,Ker,Theta,10)* for the EM kernel K-Means algorithm and *C_kmeans,__ = compute_kernel_kmeans_spectral(K,Ker,Theta,10)* for the Spectral kernel K-Means algorithm, where *K* is the number of clusters.<br>

In [4]:
# Question 1b (1): kernel K-Means on a Gaussian kernel, EM then spectral
Ker = construct_kernel(X, 'gaussian')
Theta = np.ones(n)  # uniform weights: one entry equal to 1 per data point

# Both solvers receive the same arguments, so run them through one loop.
# NOTE(review): the recorded output shows 'Construct Linear Kernel' during the
# spectral run -- presumably printed inside the helper; verify in lib/utils.
for solver, message in (
    (compute_kernel_kmeans_EM, 'accuracy non-linear kmeans with EM='),
    (compute_kernel_kmeans_spectral, 'accuracy non-linear kmeans with SPECTRAL='),
):
    C_kmeans, _ = solver(nc, Ker, Theta, 10)
    acc = compute_purity(C_kmeans, Cgt, nc)
    print(message, acc)

Construct Gaussian Kernel
accuracy non-linear kmeans with EM= 61.050000000000004
Construct Linear Kernel
accuracy non-linear kmeans with SPECTRAL= 52.1


In [5]:
# Question 1b (2): kernel K-Means on a polynomial kernel, EM then spectral
# [1,0,2] are the kernel parameters given in the exercise hint
Ker = construct_kernel(X, 'polynomial', [1,0,2])
Theta = np.ones(n)  # uniform weights: one entry equal to 1 per data point

# Both solvers receive the same arguments; the second returned value is kept
# in En_kmeans, as before.
for solver, message in (
    (compute_kernel_kmeans_EM, 'accuracy non-linear kmeans with EM='),
    (compute_kernel_kmeans_spectral, 'accuracy non-linear kmeans with SPECTRAL='),
):
    C_kmeans, En_kmeans = solver(nc, Ker, Theta, 10)
    acc = compute_purity(C_kmeans, Cgt, nc)
    print(message, acc)

Construct Polynomial Kernel
accuracy non-linear kmeans with EM= 49.95
Construct Linear Kernel
accuracy non-linear kmeans with SPECTRAL= 50.849999999999994


**Question 1c:** What is the clustering accuracy for the kernel K-Means algorithm with<br>
(1) KNN_Gaussian Kernel for the EM approach and the Spectral approach?<br>
(2) KNN_Cosine_Binary Kernel for the EM approach and the Spectral approach?<br>
You can use the value KNN_kernel=50.<br>
Hint: You may use the functions *Ker = construct_kernel(X,'kNN_gaussian',KNN_kernel)* and *Ker = construct_kernel(X,'kNN_cosine_binary',KNN_kernel)* to compute the non-linear kernels.

In [6]:
# Question 1c (1): kernel K-Means on a kNN Gaussian kernel, EM then spectral
KNN_kernel = 50  # value suggested in the exercise statement
Ker = construct_kernel(X, 'kNN_gaussian', KNN_kernel)
Theta = np.ones(n)  # uniform weights: one entry equal to 1 per data point

# Run each solver with the same arguments and report its purity score
for solver, message in (
    (compute_kernel_kmeans_EM, 'accuracy non-linear kmeans with EM='),
    (compute_kernel_kmeans_spectral, 'accuracy non-linear kmeans with SPECTRAL='),
):
    C_kmeans, _ = solver(nc, Ker, Theta, 10)
    acc = compute_purity(C_kmeans, Cgt, nc)
    print(message, acc)

Construct kNN Gaussian Kernel
accuracy non-linear kmeans with EM= 54.55
Construct Linear Kernel
accuracy non-linear kmeans with SPECTRAL= 58.650000000000006


In [7]:
# Question 1c (2): kernel K-Means on a kNN cosine binary kernel, EM then spectral
KNN_kernel = 50  # value suggested in the exercise statement
Ker = construct_kernel(X, 'kNN_cosine_binary', KNN_kernel)
Theta = np.ones(n)  # uniform weights: one entry equal to 1 per data point

# Run each solver with the same arguments and report its purity score
for solver, message in (
    (compute_kernel_kmeans_EM, 'accuracy non-linear kmeans with EM='),
    (compute_kernel_kmeans_spectral, 'accuracy non-linear kmeans with SPECTRAL='),
):
    C_kmeans, _ = solver(nc, Ker, Theta, 10)
    acc = compute_purity(C_kmeans, Cgt, nc)
    print(message, acc)

Construct kNN Cosine Binary Kernel
accuracy non-linear kmeans with EM= 58.550000000000004
Construct Linear Kernel
accuracy non-linear kmeans with SPECTRAL= 60.35
