% demo_cv.m
%
% This demo script shows the basic operation of some feature
% selection algorithms studied in J. Pohjalainen, O. Rasanen &
% S. Kadioglu: "Feature Selection Methods and Their Combinations in
% High-Dimensional Classification of Speaker Likability, Intelligibility
% and Personality Traits", Computer Speech and Language, 2014.
% The methods involved in this script are SD, MI, RSFS, SFS and SFFS.
%
% In the demo, a set of artificial features is generated from Fisher's
% Iris data by first adding noise, then randomly mapping the data points
% to a higher-dimensional space, and finally replacing some of the
% features with noise. The most useful features are then sought by the
% different algorithms, and a k-nearest neighbors (KNN) classifier is
% used to classify the data into Iris classes based on the discovered
% feature subsets.
%
% 5-fold cross-validation is performed on the data. For each
% cross-validation partition into training and test subsets, the training
% subset is used both as feature selection data and as training data for
% the eventual system, while the final class labelings are recorded for
% the test subset. Note that the training data is further divided into
% two halves in order to compute the wrapper algorithms' performance
% criterion on a set of samples distinct from the classifier training
% samples during the feature selection stage.
%
% Please see demo_simple.m for a simpler (but unrealistic) demo without
% the cross-validation.
%
% (c) Jouni Pohjalainen & Okko Rasanen
%
% For Mathworks' classification demos on the original Fisher Iris data,
% please see:
%
% http://www.mathworks.se/products/statistics/examples.html?file=
% /products/demos/shipping/stats/classdemo.html
%
% Questions and comments can be sent to jpohjala@acoustics.hut.fi or
% okko.rasanen@aalto.fi.

load fisheriris meas species

% Convert class label strings into integer labels
specs = unique(species);
labels = zeros(size(species));
for k = 1:length(specs)
    labels(ismember(species,specs(k))) = k;
end

% Add Gaussian noise to the measurement data (the original fisheriris
% data is too easy for classification).
noiselevel = 1;
meas = meas+randn(size(meas)).*noiselevel;

% Generate a set of new features through random projection from the
% original 4 features to d dimensions.
d = 200;
M = randn(size(meas,2),d);
M = sqrt(ones./(sum((M.*M)')))'*ones(1,size(M,2)).*M; % normalize rows of M to unit length
features = meas*M;
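
% The row-normalization expression above is hard to read; it can be
% written more compactly with vecnorm (a sketch assuming MATLAB R2017b or
% newer; the script itself keeps the original, more portable expression):
%
%   M = M./vecnorm(M,2,2);  % divide each row of M by its Euclidean norm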
% Replace up to 50% of the generated features with random noise features
% (indices are drawn with replacement, so duplicates may occur)
a = 1 + floor(size(features,2)*rand(round(d/2),1));
features(:,a) = randn(size(features,1),length(a));

fprintf('Feature selection using SD, MI, RSFS, SFS and SFFS\n');
fprintf('Please see the source code for more information\n');
fprintf('Evaluation started\n');

N = size(features,1);

k = 5; % k used in KNN classification

% Test KNN classification accuracy with the different feature sets
% using 5-fold randomized cross-validation in training/testing data division
ncv = 5;
cvblocksize = N/ncv;
dataorder = randperm(N);

hypos_orig = zeros(N,1);
hypos_SD = zeros(N,1);
hypos_MI = zeros(N,1);
hypos_RSFS = zeros(N,1);
hypos_SFFS = zeros(N,1);
hypos_SFS = zeros(N,1);

nfeat_RSFS = 0;
nfeat_SFS = 0;
nfeat_SFFS = 0;

for cvi = 1:ncv
    fprintf('Cross validation partition %d/%d\n',cvi,ncv);

    % test indices for this cross-validation round
    testidx = dataorder(((cvi-1)*cvblocksize+1):min(N,cvi*cvblocksize));
    % train indices for this cross-validation round
    trainidx = setdiff(1:N,testidx);
    trainidx = trainidx(randperm(length(trainidx)));
    Ntrain = length(trainidx);

    % Divide training data into two halves ("train + dev")
    trainidx1 = trainidx(1:round(Ntrain/2));
    trainidx2 = trainidx((round(Ntrain/2)+1):end);

    %% Select features using different algorithms
    [F_MI,W_MI] = MI(features(trainidx,:),labels(trainidx),3);
    [F_SD,W_SD] = SD(features(trainidx,:),labels(trainidx),3);
    [F_RSFS,W_RSFS] = RSFS(features(trainidx1,:),features(trainidx2,:),labels(trainidx1),labels(trainidx2),'verbose',1);

    k_sfs = 5:5:20; % values of the KNN k parameter over which Sequential Forward Selection (SFS) is performed
    t_sfs = 3;      % how many iterations SFS is run beyond the first detected performance maximum
    [F_SFS,W_SFS] = SFS(features(trainidx1,:),features(trainidx2,:),labels(trainidx1),labels(trainidx2),k_sfs,t_sfs);

    k_sffs = 5:5:20; % values of the KNN k parameter over which Sequential Floating Forward Selection (SFFS) is performed
    t_sffs = 3;      % how many iterations SFFS is run beyond the first detected performance maximum
    [F_SFFS,W_SFFS] = SFFS(features(trainidx1,:),features(trainidx2,:),labels(trainidx1),labels(trainidx2),k_sffs,t_sffs);
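
    % The classification step below relies on the toolkit's KNN function.
    % As an illustrative sketch only (an assumption about what a plain
    % majority-vote KNN does, not KNN.m's actual implementation; the
    % variable names are hypothetical):
    %
    %   dist = pdist2(testfeats,trainfeats);    % pairwise Euclidean distances
    %   [~,nn] = sort(dist,2);                  % order training samples by distance
    %   hypos = mode(trainlabels(nn(:,1:k)),2); % majority vote among k nearest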
    % Perform classification of the test data with each feature set
    hypos_orig(testidx) = KNN(features(trainidx,:),features(testidx,:),labels(trainidx),k);
    hypos_SD(testidx) = KNN(features(trainidx,F_SD(1:10)),features(testidx,F_SD(1:10)),labels(trainidx),k);
    hypos_MI(testidx) = KNN(features(trainidx,F_MI(1:10)),features(testidx,F_MI(1:10)),labels(trainidx),k);
    hypos_RSFS(testidx) = KNN(features(trainidx,F_RSFS),features(testidx,F_RSFS),labels(trainidx),k);
    hypos_SFS(testidx) = KNN(features(trainidx,F_SFS),features(testidx,F_SFS),labels(trainidx),k);
    hypos_SFFS(testidx) = KNN(features(trainidx,F_SFFS),features(testidx,F_SFFS),labels(trainidx),k);

    % Accumulate feature-set sizes to compute the average number of
    % features selected by RSFS, SFS and SFFS
    nfeat_RSFS = nfeat_RSFS + length(F_RSFS);
    nfeat_SFS = nfeat_SFS + length(F_SFS);
    nfeat_SFFS = nfeat_SFFS + length(F_SFFS);
end

nfeat_RSFS = nfeat_RSFS/ncv;
nfeat_SFS = nfeat_SFS/ncv;
nfeat_SFFS = nfeat_SFFS/ncv;

% Print results over all 5 folds
fprintf('Original %d features: %0.2f%% correct.\n',size(features,2),sum(hypos_orig == labels)/length(labels)*100);
fprintf('Best 10 features from SD: %0.2f%% correct.\n',sum(hypos_SD == labels)/length(labels)*100);
fprintf('Best 10 features from MI: %0.2f%% correct.\n',sum(hypos_MI == labels)/length(labels)*100);
fprintf('RSFS feature sets (%0.1f features on average): %0.2f%% correct.\n',nfeat_RSFS,sum(hypos_RSFS == labels)/length(labels)*100);
fprintf('SFS feature sets (%0.1f features on average): %0.2f%% correct.\n',nfeat_SFS,sum(hypos_SFS == labels)/length(labels)*100);
fprintf('SFFS feature sets (%0.1f features on average): %0.2f%% correct.\n',nfeat_SFFS,sum(hypos_SFFS == labels)/length(labels)*100);
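
% As an optional extra check, a per-class error breakdown can be computed
% with confusionmat from the Statistics Toolbox (already required above
% for fisheriris); shown here for the RSFS result as an example:
C_RSFS = confusionmat(labels,hypos_RSFS);
disp('Confusion matrix for RSFS (rows = true class, columns = hypothesis):');
disp(C_RSFS);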