% demo_simple.m
%
% This demo script shows the basic operation of some feature
% selection algorithms studied in J. Pohjalainen, O. Rasanen &
% S. Kadioglu: "Feature Selection Methods and Their Combinations in
% High-Dimensional Classification of Speaker Likability, Intelligibility
% and Personality Traits", Computer Speech and Language, 2014.
% The methods involved in this script are SD, MI, SFS, SFFS and RSFS.
%
% In the demo, a set of artificial features is generated from Fisher's
% Iris data by first adding noise, then randomly mapping the data points to
% a higher-dimensional space, and finally replacing some of the features
% with noise. Then the most useful features are sought by different
% algorithms, and a k-nearest neighbors (KNN) classifier is used to classify
% the data into Iris classes based on the discovered feature subsets.
%
% Note that in this simplified demo, the same set of data samples is used
% to perform feature selection, to train the KNN classifier and to evaluate it.
% For a more realistic classification scenario, please see demo_cv.m, where
% the division into separate training, development and testing data is
% accomplished by using a cross-validated evaluation scheme.
%
% Please see demo_cv.m for a more realistic demo using cross-validation
% with an independent test set.
%
% (c) Jouni Pohjalainen & Okko Rasanen
%
% For Mathworks' classification demos on the original Fisher Iris data,
% please see:
%
% http://www.mathworks.se/products/statistics/examples.html?file=
% /products/demos/shipping/stats/classdemo.html
%
% Questions and comments can be sent to jpohjala@acoustics.hut.fi or
% okko.rasanen@aalto.fi .
echo off

% Load Fisher's iris dataset (meas = samples, species = class labels)
load fisheriris meas species

% Convert class label strings into integer labels. The third output of
% unique() gives, for every sample, the index of its class name in the
% sorted list of distinct names (specs), i.e. labels are 1..numel(specs).
[specs,~,labels] = unique(species);
labels = reshape(labels,size(species)); % keep the same shape as species

% Add Gaussian noise to the measurement data (original fisheriris is too
% easy for classification).
noiselevel = 1;
meas = meas + noiselevel.*randn(size(meas));

% Generate a set of new features through random projection from the
% original 4 features to d dimensions.
d = 200;
M = randn(size(meas,2),d);
% Scale every row of M to unit Euclidean norm. bsxfun is used instead of
% implicit expansion so the script also runs on older MATLAB releases.
M = bsxfun(@rdivide,M,sqrt(sum(M.^2,2)));

features = meas*M;

% Replace max 50% of the generated features with random noise features.
% The column indices are drawn independently, so duplicates are possible
% and fewer than round(d/2) distinct columns may end up being replaced.
a = 1 + floor(size(features,2)*rand(round(d/2),1));
features(:,a) = randn(size(features,1),length(a));

fprintf('Feature selection using SD, MI, RSFS, SFS and SFFS\n');
fprintf('Please see the source code for more information\n');
fprintf('Evaluation started\n');

echo on
%% Select features using different algorithms
% The same data is passed for both data arguments and both label arguments
% of RSFS, SFS and SFFS: this simplified demo has no separate held-out set.
[F_MI,W_MI] = MI(features,labels,3);
[F_SD,W_SD] = SD(features,labels,3);
[F_RSFS,W_RSFS] = RSFS(features,features,labels,labels,'verbose',1,'max_delta',0.03);

k_sfs = 5:5:20; % Values of KNN k parameter over which Sequential Forward Selection (SFS) is performed
t_sfs = 3; % How many iterations is SFS run beyond the first detected performance maximum?
[F_SFS,W_SFS] = SFS(features,features,labels,labels,k_sfs,t_sfs);

k_sffs = 5:5:20; % Values of KNN k parameter over which Sequential Floating Forward Selection (SFFS) is performed
t_sffs = 3; % How many iterations is SFFS run beyond the first detected performance maximum?
[F_SFFS,W_SFFS] = SFFS(features,features,labels,labels,k_sffs,t_sffs);
echo off

%% Test KNN classification accuracy with the different feature sets using the same data points for training and testing
% (note that k = 1 always leads to 100% accuracy without an independent test set).
k = 5; % k used in KNN classification
n_best = 10; % How many of the top-ranked SD / MI features are evaluated

% Percentage of hypotheses that equal the true labels. Both vectors are
% flattened to columns so the comparison stays element-wise regardless of
% the orientation in which KNN returns its hypotheses (a row-vs-column
% comparison would otherwise expand into a matrix on recent MATLAB releases).
accuracy = @(hypos) sum(hypos(:) == labels(:))/length(labels)*100;

% Baseline: all generated features. As everywhere in this demo, the same
% samples are passed for both data arguments of KNN.
hypos_orig = KNN(features,features,labels,k);
fprintf('Original %d features: %0.2f%% correct.\n',size(features,2),accuracy(hypos_orig));

% SD ranking: take the first n_best entries of F_SD, clamped so that a
% shorter ranking does not cause an out-of-range index error.
top_SD = F_SD(1:min(n_best,length(F_SD)));
hypos_SD = KNN(features(:,top_SD),features(:,top_SD),labels,k);
fprintf('Best %d features from SD: %0.2f%% correct.\n',length(top_SD),accuracy(hypos_SD));

% MI ranking: same treatment as SD.
top_MI = F_MI(1:min(n_best,length(F_MI)));
hypos_MI = KNN(features(:,top_MI),features(:,top_MI),labels,k);
fprintf('Best %d features from MI: %0.2f%% correct.\n',length(top_MI),accuracy(hypos_MI));

% RSFS, SFS and SFFS return complete feature subsets, which are used as-is.
hypos_RSFS = KNN(features(:,F_RSFS),features(:,F_RSFS),labels,k);
fprintf('RSFS feature set (%d features): %0.2f%% correct.\n',length(F_RSFS),accuracy(hypos_RSFS));

hypos_SFS = KNN(features(:,F_SFS),features(:,F_SFS),labels,k);
fprintf('SFS feature set (%d features): %0.2f%% correct.\n',length(F_SFS),accuracy(hypos_SFS));

hypos_SFFS = KNN(features(:,F_SFFS),features(:,F_SFFS),labels,k);
fprintf('SFFS feature set (%d features): %0.2f%% correct.\n',length(F_SFFS),accuracy(hypos_SFFS));