# Predict wine quality

Use the latest versions of DataFrame and KotlinDL libraries from the [version repository](https://github.com/Kotlin/kotlin-jupyter-libraries).

To run this notebook in Kotlin Notebook, please make sure "Resolve multiplatform dependencies" is turned OFF for this library

In [1]:
%useLatestDescriptors

In [2]:
%use dataframe

In [3]:
%use kotlin-dl

Read the dataframe from CSV and print the first few lines of it

In [4]:
val rawDf = DataFrame.readCsv(fileOrUrl = "winequality-red.csv", delimiter = ';')
rawDf.head()

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


_Note:_ For formatting, the DataFrame needs to be rendered as HTML.
This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off, or we need
to explicitly turn the dataframe into HTML.

In [6]:
rawDf.corr()
    .format { colsOf<Double>() }.with { linearBg(value = it, from = -1.0 to red, to = 1.0 to green) }
    .toHtml()

Based on the correlation, we can remove some columns, they seem to be insignificant

In [7]:
val df = rawDf.remove { `free sulfur dioxide` and `residual sugar` and pH }
df

fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol,quality
7.4,0.7,0.0,0.076,34.0,0.9978,0.56,9.4,5
7.8,0.88,0.0,0.098,67.0,0.9968,0.68,9.8,5
7.8,0.76,0.04,0.092,54.0,0.997,0.65,9.8,5
11.2,0.28,0.56,0.075,60.0,0.998,0.58,9.8,6
7.4,0.7,0.0,0.076,34.0,0.9978,0.56,9.4,5
7.4,0.66,0.0,0.075,40.0,0.9978,0.56,9.4,5
7.9,0.6,0.06,0.069,59.0,0.9964,0.46,9.4,5
7.3,0.65,0.0,0.065,21.0,0.9946,0.47,10.0,7
7.8,0.58,0.02,0.073,18.0,0.9968,0.57,9.5,7
7.5,0.5,0.36,0.071,102.0,0.9978,0.8,10.5,5


## Predict wine quality: first approach

In [8]:
// Simple converter function between DataFrame and KotlinDL data representations
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset {
    return OnHeapDataset.create(
        dataframe = this,
        yColumn = labelColumnName
    )
}

fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { (it.values() as List<Float>).toFloatArray() }.toTypedArray()

    fun extractY(): FloatArray =
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}

In [9]:
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
    .toOnHeapDataset(labelColumnName = "quality")
    .split(0.8)

Define simple neural network with only 2 dense layers

In [10]:
val inputNeurons = train.x[0].size.toLong()

val model = Sequential.of(
    Input(
        inputNeurons,
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    )
)

In [11]:
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)

In [12]:
model.printSummary()

Model type: Sequential
______________________________________________________________________________
Layer (type)                           Output Shape              Param #      
input_1(Input)                         [None, 8]                 0            
______________________________________________________________________________
dense_2(Dense)                         [None, 80]                720          
______________________________________________________________________________
dense_3(Dense)                         [None, 80]                6480         
______________________________________________________________________________
dense_4(Dense)                         [None, 1]                 81           
______________________________________________________________________________
Total trainable params: 7281
Total frozen params: 0
Total params: 7281
______________________________________________________________________________


Train it!

In [13]:
val trainHist = model.fit(train, batchSize = 500, epochs=2000)

In [14]:
trainHist.epochHistory.toDataFrame().tail()

epochIndex,lossValue,metricValues,valLossValue,valMetricValues
1996,0.334851,[0.45112717151641846],,[NaN]
1997,0.334814,[0.45109668374061584],,[NaN]
1998,0.334778,[0.45106613636016846],,[NaN]
1999,0.334741,[0.45103588700294495],,[NaN]
2000,0.334705,[0.45100536942481995],,[NaN]


Let's check that our network predicts values more or less correctly:

In [15]:
model.predictSoftly(test.x[9])[0]

5.24972

In [16]:
test.y[9]

5.0

Close the model:

In [17]:
model.close()

## Predict wine quality: second approach

In [19]:
data class TrainTestSplitResult<T>(
    val trainX: DataFrame<T>,
    val trainY: DataFrame<T>,
    val testX: DataFrame<T>,
    val testY: DataFrame<T>,
)

fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    val n = d.count()
    val trainN = ceil(n * trainPart).toInt()

    val shuffledInd = (0..<n).shuffled()
    val trainInd = shuffledInd.subList(0, trainN)
    val testInd = shuffledInd.subList(trainN, n)

    val train = d[trainInd]
    val test = d[testInd]

    val trainX = train.select { allExcept(col) }
    val trainY = train.select(col)

    val testX = test.select { allExcept(col) }
    val testY = test.select(col)

    return TrainTestSplitResult(trainX, trainY, testX, testY)
}

Let's create and then train the model as we did before

In [20]:
val (trainX, trainY, testX, testY) =
    trainTestSplit(df, "quality", 0.8)

In [22]:
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
    merge { colsOf<Number>() }.by { it.map { it.toFloat() }.toFloatArray() }.into("X")
        .get { "X"<FloatArray>() }
        .toList()
        .toTypedArray()

In [23]:
fun <T> DataFrame<T>.toY(): FloatArray = 
    get { "quality"<Int>() }
        .asIterable()
        .map { it.toFloat() }
        .toFloatArray()

In [24]:
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()

In [25]:
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })

In [26]:
val inputNeurons = train.x[0].size.toLong()

val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()

Model type: Sequential
______________________________________________________________________________
Layer (type)                           Output Shape              Param #      
input_1(Input)                         [None, 8]                 0            
______________________________________________________________________________
dense_2(Dense)                         [None, 80]                720          
______________________________________________________________________________
dense_3(Dense)                         [None, 80]                6480         
______________________________________________________________________________
dense_4(Dense)                         [None, 1]                 81           
______________________________________________________________________________
Total trainable params: 7281
Total frozen params: 0
Total params: 7281
______________________________________________________________________________


In [27]:
val trainHist = model2.fit(train, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()

epochIndex,lossValue,metricValues,valLossValue,valMetricValues
1996,0.334532,[0.4508610963821411],,[NaN]
1997,0.334495,[0.45082950592041016],,[NaN]
1998,0.334458,[0.45079800486564636],,[NaN]
1999,0.334421,[0.4507667124271393],,[NaN]
2000,0.334384,[0.45073509216308594],,[NaN]


In [28]:
model2.predictSoftly(testXDL[9])[0]

5.8768764

In [29]:
testYDL[9]

5.0

We can also compare predicted and ground truth values to ensure predictions are correct

In [30]:
val predicted = testXDL.mapIndexed { i, _ ->
    round(model2.predictSoftly(testXDL[i])[0]).toInt()
}.toColumn("predicted")

val ground_truth = testYDL.mapIndexed { i, _ ->
    testYDL[i].toInt()
}.toColumn("ground_truth")

val predDf = dataFrameOf(predicted, ground_truth)

In [31]:
predDf.head()

predicted,ground_truth
6,6
5,4
6,6
5,5
5,5


In [35]:
val inds = List(10) { it + 1 }
val ctab = predDf
    .groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
    .sortBy { ground_truth }

ctab.format { drop(1) }.perRowCol { row, col ->
    val y = col.name().toInt()
    val x = row.ground_truth
    val k = 1.0 - abs(x - y) / 10.0
    background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}.toHtml()

In [36]:
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }

In [37]:
predDf2.avg_dev.cast<Double>().describe()

name,type,count,unique,nulls,top,freq,mean,std,min,p25,median,p75,max
avg_dev,Int,319,3,0,0,196,0.407524,0.535007,0,0.0,0.0,1.0,2


In [38]:
predDf2.sortBy { avg_dev }[(0.7 * (319 - 1)).toInt()]

predicted,ground_truth,avg_dev
6,5,1


In [39]:
model2.close()