# Predict wine quality

Use the latest versions of DataFrame and KotlinDL libraries from [version repository](https://github.com/Kotlin/kotlin-jupyter-libraries).

In [1]:
%use dataframe

In [2]:
%use kotlin-dl

Read the dataframe from the CSV file and display its first few rows

In [3]:
// Load the wine-quality data; the file uses ';' as the field delimiter.
val rawDf = DataFrame.readCsv(fileOrUrl = "winequality-red.csv", delimiter = ';')
// The last expression of the cell is what gets displayed: the first rows of the frame.
rawDf.head()

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7400000,700000,0,1900000,76000,11000000,34000000,997800,3510000,560000,9400000,5
7800000,880000,0,2600000,98000,25000000,67000000,996800,3200000,680000,9800000,5
7800000,760000,40000,2300000,92000,15000000,54000000,997000,3260000,650000,9800000,5
11200000,280000,560000,1900000,75000,17000000,60000000,998000,3160000,580000,9800000,6
7400000,700000,0,1900000,76000,11000000,34000000,997800,3510000,560000,9400000,5


Note: For formatting, the DataFrame needs to be rendered as HTML. This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off.

In [4]:
// Colour each Double cell of the correlation matrix on a linear gradient
// running from red at -1.0 to green at 1.0.
rawDf.corr()
    .format { colsOf<Double>() }
    .with { correlation ->
        linearBg(value = correlation, from = -1.0 to red, to = 1.0 to green)
    }

column,fixed acidity,volatile acidity,residual sugar,chlorides,density,pH,sulphates,alcohol,quality
fixed acidity,1000000,-256131,114777,93705,668047,-682978,183006,-61668,124052
volatile acidity,-256131,1000000,1918,61298,22026,234937,-260987,-202288,-390558
residual sugar,114777,1918,1000000,55610,355283,-85652,5527,42075,13732
chlorides,93705,61298,55610,1000000,200632,-265026,371260,-221141,-128907
density,668047,22026,355283,200632,1000000,-341699,148506,-496180,-174919
pH,-682978,234937,-85652,-265026,-341699,1000000,-196648,205633,-57731
sulphates,183006,-260987,5527,371260,148506,-196648,1000000,93595,251397
alcohol,-61668,-202288,42075,-221141,-496180,205633,93595,1000000,476166
quality,124052,-390558,13732,-128907,-174919,-57731,251397,476166,1000000


Based on the correlations, we can remove some columns that seem to be insignificant

In [5]:
// Drop the `free sulfur dioxide`, `residual sugar` and `pH` columns from the raw frame.
val df = rawDf.remove { `free sulfur dioxide` and `residual sugar` and pH }
// Display the reduced frame as the cell result.
df

fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol,quality
7400000,700000,0,76000,34000000,997800,560000,9400000,5
7800000,880000,0,98000,67000000,996800,680000,9800000,5
7800000,760000,40000,92000,54000000,997000,650000,9800000,5
11200000,280000,560000,75000,60000000,998000,580000,9800000,6
7400000,700000,0,76000,34000000,997800,560000,9400000,5
7400000,660000,0,75000,40000000,997800,560000,9400000,5
7900000,600000,60000,69000,59000000,996400,460000,9400000,5
7300000,650000,0,65000,21000000,994600,470000,10000000,7
7800000,580000,20000,73000,18000000,996800,570000,9500000,7
7500000,500000,360000,71000,102000000,997800,800000,10500000,5


## Predict wine quality: first approach

In [6]:
/**
 * Simple converter between the DataFrame and KotlinDL data representations.
 *
 * Delegates to `OnHeapDataset.create`, passing this frame as the data source and
 * [labelColumnName] as the name of the label (Y) column.
 */
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset =
    OnHeapDataset.create(dataframe = this, yColumn = labelColumnName)

/**
 * Builds an [OnHeapDataset] from [dataframe]: the [yColumn] column supplies the labels and
 * every remaining column supplies the features.
 *
 * Cells are converted with [Number.toFloat] instead of being force-cast to `Float`, so
 * numeric columns of other types (for example an `Int` label column) are handled explicitly
 * rather than through an unchecked cast.
 *
 * @param dataframe source frame; every cell is expected to be a non-null number.
 * @param yColumn name of the label column.
 * @return a dataset whose features and labels are extracted from [dataframe].
 * @throws IllegalArgumentException when a cell is `null` or not a [Number]
 *   (raised when the dataset generators are invoked).
 */
fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    // Single place that turns an arbitrary cell into a Float, with a readable failure.
    fun asFloat(cell: Any?): Float =
        (cell as? Number)?.toFloat()
            ?: throw IllegalArgumentException("Expected a numeric cell but found: $cell")

    // One FloatArray per row, built from all columns except the label column.
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { row -> row.values().map { cell -> asFloat(cell) }.toFloatArray() }
            .toTypedArray()

    // The label column as a flat FloatArray.
    fun extractY(): FloatArray =
        dataframe.get { yColumn<Any?>() }.toList()
            .map { cell -> asFloat(cell) }
            .toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}

In [7]:
// Convert the Double columns to Float, wrap the frame in an OnHeapDataset that uses
// "quality" as the label column, and split it with ratio 0.8 into the (train, test) pair.
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
    .toOnHeapDataset(labelColumnName = "quality")
    .split(0.8)

Define a simple neural network with two hidden dense layers followed by a single-neuron linear output layer

In [8]:
// Number of input features, taken from the first training sample.
val inputNeurons = train.x[0].size.toLong()

// Network layout: input -> two tanh hidden layers (each 10x the input width)
// -> one linear output neuron. All kernels and biases use HeNormal initialisation.
val model = run {
    val hiddenWidth = (inputNeurons * 10).toInt()

    Sequential.of(
        Input(inputNeurons),
        Dense(
            outputSize = hiddenWidth,
            activation = Activations.Tanh,
            kernelInitializer = HeNormal(),
            biasInitializer = HeNormal(),
        ),
        Dense(
            outputSize = hiddenWidth,
            activation = Activations.Tanh,
            kernelInitializer = HeNormal(),
            biasInitializer = HeNormal(),
        ),
        Dense(
            outputSize = 1,
            activation = Activations.Linear,
            kernelInitializer = HeNormal(),
            biasInitializer = HeNormal(),
        ),
    )
}

java.lang.UnsatisfiedLinkError: Cannot find TensorFlow native library for OS: darwin, architecture: aarch64. See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/java/README.md for possible solutions (such as building the library from source). Additional information on attempts to find the native library can be obtained by adding org.tensorflow.NativeLibrary.DEBUG=1 to the system properties of the JVM.

In [None]:
// Configure training: Adam optimizer, MSE as the loss, MAE as the reported metric.
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)

In [None]:
// Print a summary of the compiled model.
model.printSummary()

Train it!

In [None]:
// Fit the model on the training dataset and keep the returned training history.
val trainHist = model.fit(
    train,
    batchSize = 500,
    epochs = 2000,
)

In [None]:
// Show the last rows of the per-epoch training history as a DataFrame.
trainHist.epochHistory.toDataFrame().tail()

Let's check that our network predicts values more or less correctly:

In [None]:
// First element of the model's output for the test sample at index 9.
model.predictSoftly(test.x[9])[0]

In [None]:
// Label stored for the test sample at index 9, for comparison with the model's prediction.
test.y[9]

Close the model:

In [None]:
// Close the first model now that it is no longer needed.
model.close()

## Predict wine quality: second approach

In [None]:
/**
 * The four frames returned by `trainTestSplit`; declared as a data class so the result
 * can be destructured into `(trainX, trainY, testX, testY)`.
 *
 * @property trainX train rows with the selected column excluded.
 * @property trainY train rows with only the selected column.
 * @property testX test rows with the selected column excluded.
 * @property testY test rows with only the selected column.
 */
data class TrainTestSplitResult<T>(
    val trainX: DataFrame<T>,
    val trainY: DataFrame<T>,
    val testX: DataFrame<T>,
    val testY: DataFrame<T>,
)

/**
 * Randomly splits the rows of [d] into a train and a test part, and separates the [col]
 * column (Y) from all other columns (X) in each part.
 *
 * @param d the frame to split.
 * @param col name of the column placed in the Y frames; every other column goes to the X frames.
 * @param trainPart fraction of rows, within `0.0..1.0`, assigned to the train part;
 *   the resulting row count is rounded up.
 * @return the train/test X and Y frames.
 * @throws IllegalArgumentException if [trainPart] is outside `0.0..1.0` (including NaN).
 */
fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    // Fail early with a readable message instead of an opaque subList error further down.
    require(trainPart in 0.0..1.0) { "trainPart must be within 0.0..1.0, but was $trainPart" }

    val n = d.count()
    val trainN = ceil(n * trainPart).toInt()

    // Shuffle the row indices once, then cut the shuffled list at the train size.
    val shuffledInd = (0 until n).shuffled()
    val train = d[shuffledInd.subList(0, trainN)]
    val test = d[shuffledInd.subList(trainN, n)]

    // Same column selection for both parts: everything but `col`, and `col` alone.
    fun features(part: DataFrame<T>): DataFrame<T> = part.select { all().except(cols(col)) }
    fun labels(part: DataFrame<T>): DataFrame<T> = part.select(col)

    return TrainTestSplitResult(
        trainX = features(train),
        trainY = labels(train),
        testX = features(test),
        testY = labels(test),
    )
}

Let's create and then train the model as we did before

In [None]:
// Split the frame into train/test parts, separating the "quality" column as Y.
val (trainX, trainY, testX, testY) = trainTestSplit(
    d = df,
    col = "quality",
    trainPart = 0.8,
)

In [None]:
/**
 * Converts the numeric columns of this frame into a feature matrix:
 * one [FloatArray] per row, with every value converted via `toFloat()`.
 */
fun <T> DataFrame<T>.toX(): Array<FloatArray> {
    // Merge all Number columns of a row into a single FloatArray cell named "X".
    val merged = merge { colsOf<Number>() }
        .by { rowValues -> rowValues.map { number -> number.toFloat() }.toFloatArray() }
        .into("X")

    // Read the merged column back and turn it into an array of rows.
    return merged
        .get { "X"<FloatArray>() }
        .toList()
        .toTypedArray()
}

In [None]:
/**
 * Extracts the label column of this frame as a [FloatArray].
 *
 * @param labelColumn name of the numeric column to read; defaults to "quality",
 *   so existing `toY()` calls keep their behaviour.
 * @return the column values converted with `toFloat()`, in row order.
 */
fun <T> DataFrame<T>.toY(labelColumn: String = "quality"): FloatArray =
    get { labelColumn<Number>() }
        .asIterable()
        .map { it.toFloat() }
        .toFloatArray()

In [None]:
// Convert the split frames into the float arrays used for the KotlinDL datasets below:
// feature matrices via toX(), label vectors via toY().
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()

In [None]:
// Wrap the prepared arrays into KotlinDL datasets; each lambda returns the precomputed
// features or labels.
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })

In [None]:
// Size the input layer from this approach's own feature matrix (trainXDL) rather than
// from the first approach's `train` dataset, so the second approach does not depend on
// state left over from the first one.
val inputNeurons = trainXDL[0].size.toLong()

// Same architecture as before: two tanh hidden layers (10x the input width each)
// followed by a single linear output neuron.
val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
// Adam optimizer, MSE loss, MAE as the reported metric.
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()

In [None]:
// Train on the dataset built from this approach's own shuffled split. Fitting on the
// first approach's `train` would let rows that are in testXDL/testYDL leak into training
// and make the evaluation below meaningless.
val trainHist = model2.fit(trainKotlinDLDataset, batchSize = 500, epochs = 2000)
// Show the last rows of the per-epoch training history.
trainHist.epochHistory.toDataFrame().tail()

In [None]:
// First element of model2's output for the test feature row at index 9.
model2.predictSoftly(testXDL[9])[0]

In [None]:
// Label stored for the test row at index 9, for comparison with model2's prediction.
testYDL[9]

We can also compare predicted and ground truth values to ensure predictions are correct

In [None]:
// Model output for every test row, rounded and converted to an integer grade.
val predicted = testXDL
    .map { features -> round(model2.predictSoftly(features)[0]).toInt() }
    .toColumn("predicted")

// The float labels converted back to integer grades.
val ground_truth = testYDL
    .map { label -> label.toInt() }
    .toColumn("ground_truth")

// Two-column frame pairing each prediction with its ground-truth value.
val predDf = dataFrameOf(predicted, ground_truth)

In [None]:
// Display the first rows of the predicted vs. ground-truth frame.
predDf.head()

In [None]:
// The integers 1..10. NOTE(review): not referenced anywhere in the visible cells —
// confirm whether it is still needed.
val inds = List(10) { it + 1 }
// Cross-tabulation: one row per ground-truth value, one count column per predicted value,
// sorted by ground truth.
val ctab = predDf
    .groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
    .sortBy { ground_truth }

// Shade the cells selected by drop(1) (presumably every column except the leading
// ground_truth column — confirm): the closer the column's predicted value is to the
// row's ground truth, the brighter the green channel.
ctab.format { drop(1) }.perRowCol { row, col ->
    // The column name is the predicted value it counts.
    val y = col.name().toInt()
    val x = row.ground_truth
    // k is 1.0 on an exact match and decreases by 0.1 per unit of distance.
    val k = 1.0 - abs(x - y) / 10.0
    // Green channel ranges up to 250 (k = 1.0); red and blue stay fixed at 50.
    background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}

In [None]:
// Add an "avg_dev" column holding, for each row, the absolute difference between the
// predicted and the ground-truth value (a per-row deviation, despite the "avg" in the name).
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }

In [None]:
// Summary statistics of the per-row deviation.
// NOTE(review): avg_dev is computed from integer columns; cast<Double>() presumably only
// changes the declared type without converting values — confirm this is intended.
predDf2.avg_dev.cast<Double>().describe()

In [None]:
// Row at the 70% position of the frame sorted by deviation. The position is derived from
// the actual row count instead of a hard-coded test-set size, so it stays correct when
// the data or the split ratio changes.
predDf2.sortBy { avg_dev }[(0.7 * (predDf2.rowsCount() - 1)).toInt()]

In [None]:
// Close the second model now that it is no longer needed.
model2.close()