#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

#
# Generates random Gaussian-mixture data to test k-Means clustering algorithms
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME  TYPE   DEFAULT  MEANING
# ----------------------------------------------------------------------------
# nr    Int     ---     Number of records
# nf    Int     ---     Number of features
# nc    Int     ---     Number of clusters
# dc    Double  ---     St.dev. of cluster "centroid" features from zero mean
# dr    Double  ---     St.dev. of the 1-st feature in a record within cluster
# fbf   Double  ---     Feature bias factor: Stdev(last) / Stdev(1-st) feature
# cbf   Double  ---     Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
# X     String  "X"     Location to write matrix X with generated data records
# C     String  "C"     Location to write cluster "centroids" (Gaussian means)
# Y     String  "Y"     Location to write assignment of records to cluster ids
# YbyC  String  "YbyC"  Location to write rec-cluster assigns by min-dist to C
# fmt   String  "text"  Output format used for all four written matrices
# ----------------------------------------------------------------------------
#
# Example:
# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
#     nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx

print ("BEGIN K-MEANS GENERATOR SCRIPT");

# Output locations and format: each falls back to the given default when the
# corresponding named argument is not supplied on the command line.
fmt      = ifdef ($fmt, "text");
fileX    = ifdef ($X, "X");
fileC    = ifdef ($C, "C");
fileY    = ifdef ($Y, "Y");
fileYbyC = ifdef ($YbyC, "YbyC");

# Dataset dimensions (no defaults: these named arguments must be supplied).
num_records   = $nr;
num_features  = $nf;
num_centroids = $nc;

# Spread of the centroids around zero and of the records around their centroid.
dist_per_feature_centroids    = $dc;
dist_per_feature_first_record = $dr;

# Skew controls: across features (stdev ratio) and across clusters (prob. ratio).
feature_bias_factor = $fbf;
cluster_bias_factor = $cbf;

print ("Generating cluster distribution (mixture) centroids...");

# One centroid per row: standard-normal draws scaled to the requested st.dev.
C = dist_per_feature_centroids
    * Rand (rows = num_centroids, cols = num_features, pdf = "normal");

print ("Generating record-to-cluster assignments...");

# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
# than "num_centroids" by the factor of "cluster_bias_factor".
#
# Sampling is done by inverse-CDF transform of a uniform draw "rnd":
#  - unbiased case: the interval [0.5, num_centroids + 0.5) is covered
#    uniformly and rounded to the nearest cluster id;
#  - biased case: the same interval is covered with an exponentially
#    decaying density, so that Prob[Y = k] is proportional to
#    cluster_bias_factor ^ (-(k - 1) / (num_centroids - 1)).
#
# The biased formula divides by (num_centroids - 1), so a single-cluster
# request is routed through the unbiased branch, where every record gets
# cluster id 1 and no division by zero (and hence no NaN id) can occur.

rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
if (cluster_bias_factor == 1.0 | num_centroids == 1) {
    Y = round (0.5 + rnd * num_centroids);
} else {
    # Rescale rnd so that the largest draw maps to just below num_centroids + 0.5
    rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1)));
    Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor));
}

print ("Generating within-cluster random shifts...");

# Standard-normal noise, one row per record and one column per feature.
X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");

# Per-feature standard deviations grow geometrically from
# dist_per_feature_first_record (1-st feature) up to
# dist_per_feature_first_record * feature_bias_factor (last feature).
# With a single feature there is no interpolation range, so the step count
# is clamped to 1; this avoids a 0/0 = NaN factor and leaves that feature at
# dist_per_feature_first_record. For two or more features nothing changes.
feature_steps = max (num_features - 1, 1);
feature_factors = dist_per_feature_first_record *
    exp ((seq (1, num_features) - 1) / feature_steps * log (feature_bias_factor));

# Scale column j of the noise by feature_factors[j].
X_shift = X_shift %*% diag (feature_factors);

print ("Generating records by shifting from centroids..."); 

# One-hot encode the assignments: row i has a 1 in column Y[i]. The table()
# result only has as many columns as the largest id actually drawn, so it is
# copied into a zero matrix that always has num_centroids columns.
onehot_observed = table (seq (1, num_records), Y);
onehot_full = matrix (0, rows = num_records, cols = num_centroids);
onehot_full [, 1 : ncol (onehot_observed)] = onehot_observed;

# Each record is the centroid of its cluster plus its random shift.
record_means = onehot_full %*% C;
X = record_means + X_shift;

print ("Computing record-to-cluster assignments by minimum centroid distance...");

# D[i, k] = -2 * <x_i, c_k> + ||c_k||^2, i.e. the squared Euclidean distance
# from record i to centroid k without the ||x_i||^2 term, which is constant
# within a row and therefore irrelevant for picking the row minimum.
cross_terms = X %*% t(C);
centroid_sq_norms = rowSums (C ^ 2);
D = t(t(-2 * cross_terms) + centroid_sq_norms);

# Flag the minimizing centroid(s) in every row, then pick the first one:
# the row-wise running count of flags is zero exactly in the columns that
# precede the first flagged centroid.
row_minimum = rowMins (D);
is_nearest = (D <= row_minimum);
flags_so_far = t(cumsum (t(is_nearest)));
Y_by_C = 1 + rowSums (flags_so_far == 0);

print ("Computing useful statistics...");

# Total sum of squares of all generated values.
total_sq = sum (X ^ 2);

# WCSS with a single cluster centered at the column means of X.
column_totals = colSums (X);
wcss_single = total_sq - sum (column_totals ^ 2) / num_records;

# WCSS when every record is charged to its nearest mixture centroid; adding
# the total sum of squares restores the ||x||^2 term that D leaves out.
nearest_offsets = rowMins (D);
wcss_mixture = total_sq + sum (nearest_offsets);

print ("Default (single-cluster) WCSS = " + wcss_single);
print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + wcss_mixture);

print ("Writing out the resulting dataset...");

# Records, centroids, generating cluster ids, and nearest-centroid cluster ids.
write (X,      fileX,    format = fmt);
write (C,      fileC,    format = fmt);
write (Y,      fileY,    format = fmt);
write (Y_by_C, fileYbyC, format = fmt);

print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC); 

print ("DONE: K-MEANS GENERATOR SCRIPT");