# Predict wine quality

Use the latest versions of DataFrame and KotlinDL libraries from [version repository](https://github.com/Kotlin/kotlin-jupyter-libraries).

In [1]:
%useLatestDescriptors
%use dataframe, kotlin-dl

Read the dataframe from CSV and print the first few lines of it

In [2]:
val raw_df = DataFrame.readCSV(fileOrUrl = "winequality-red.csv", delimiter = ';')
raw_df.head()

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Note: For formatting, the DataFrame needs to be rendered as HTML. This means that when running in Kotlin Notebook, "Render DataFrame tables natively" needs to be turned off.

In [3]:
raw_df.corr().format { colsOf<Double>() }.with { 
    linearBg(value = it, from = -1.0 to red, to = 1.0 to green)
}

column,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


Based on the correlation, we can remove some columns, they seem to be insignificant

In [4]:
val df = raw_df.remove { `free sulfur dioxide` and `residual sugar` and pH }
df

fixed acidity,volatile acidity,citric acid,chlorides,total sulfur dioxide,density,sulphates,alcohol,quality
7.4,0.7,0.0,0.076,34.0,0.9978,0.56,9.4,5
7.8,0.88,0.0,0.098,67.0,0.9968,0.68,9.8,5
7.8,0.76,0.04,0.092,54.0,0.997,0.65,9.8,5
11.2,0.28,0.56,0.075,60.0,0.998,0.58,9.8,6
7.4,0.7,0.0,0.076,34.0,0.9978,0.56,9.4,5
7.4,0.66,0.0,0.075,40.0,0.9978,0.56,9.4,5
7.9,0.6,0.06,0.069,59.0,0.9964,0.46,9.4,5
7.3,0.65,0.0,0.065,21.0,0.9946,0.47,10.0,7
7.8,0.58,0.02,0.073,18.0,0.9968,0.57,9.5,7
7.5,0.5,0.36,0.071,102.0,0.9978,0.8,10.5,5


## Predict wine quality: first approach

In [5]:
// Simple converter function between DataFrame and KotlinDL data representations
fun <T> DataFrame<T>.toOnHeapDataset(labelColumnName: String): OnHeapDataset {
    return OnHeapDataset.create(
        dataframe = this,
        yColumn = labelColumnName
    )
}

fun OnHeapDataset.Companion.create(
    dataframe: DataFrame<Any?>,
    yColumn: String
): OnHeapDataset {
    fun extractX(): Array<FloatArray> =
        dataframe.remove(yColumn).rows()
            .map { (it.values() as List<Float>).toFloatArray() }.toTypedArray()

    fun extractY(): FloatArray =
        dataframe.get { yColumn<Float>() }.toList().toFloatArray()

    return create(
        ::extractX,
        ::extractY
    )
}

In [6]:
val (train, test) = df.convert { colsOf<Double>() }.toFloat()
    .toOnHeapDataset(labelColumnName = "quality")
    .split(0.8)

Define simple neural network with only 2 dense layers

In [7]:
val inputNeurons = train.x[0].size.toLong()

val model = Sequential.of(
    Input(
        inputNeurons,
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal(),
    )
)

In [8]:
model.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)

In [9]:
model.printSummary()

Model type: Sequential
______________________________________________________________________________
Layer (type)                           Output Shape              Param #      
input_1(Input)                         [None, 8]                 0            
______________________________________________________________________________
dense_2(Dense)                         [None, 80]                720          
______________________________________________________________________________
dense_3(Dense)                         [None, 80]                6480         
______________________________________________________________________________
dense_4(Dense)                         [None, 1]                 81           
______________________________________________________________________________
Total trainable params: 7281
Total frozen params: 0
Total params: 7281
______________________________________________________________________________


Train it!

In [10]:
val trainHist = model.fit(train, batchSize = 500, epochs=2000)

In [11]:
trainHist.epochHistory.toDataFrame().tail()

epochIndex,lossValue,metricValues,valLossValue,valMetricValues
1996,0.334877,[0.45114806294441223],,[NaN]
1997,0.334841,[0.45111772418022156],,[NaN]
1998,0.334805,[0.45108696818351746],,[NaN]
1999,0.334768,[0.45105621218681335],,[NaN]
2000,0.334732,[0.4510253965854645],,[NaN]


Let's check that our network predicts values more or less correctly:

In [12]:
model.predictSoftly(test.x[9])[0]

5.2477317

In [13]:
test.y[9]

5.0

Close the model:

In [14]:
model.close()

## Predict wine quality: second approach

In [15]:
data class TrainTestSplitResult<T>(
    val trainX: DataFrame<T>,
    val trainY: DataFrame<T>,
    val testX: DataFrame<T>,
    val testY: DataFrame<T>,
)

fun <T> trainTestSplit(
    d: DataFrame<T>,
    col: String,
    trainPart: Double,
): TrainTestSplitResult<T> {
    val n = d.count()
    val trainN = ceil(n * trainPart).toInt()

    val shuffledInd = (0 until n).shuffled()
    val trainInd = shuffledInd.subList(0, trainN)
    val testInd = shuffledInd.subList(trainN, n)

    val train = d[trainInd]
    val test = d[testInd]

    val trainX = train.select { all().except(cols(col)) }
    val trainY = train.select(col)

    val testX = test.select { all().except(cols(col)) }
    val testY = test.select(col)

    return TrainTestSplitResult(trainX, trainY, testX, testY)
}

Let's create and then train the model as we did before

In [16]:
val (trainX, trainY, testX, testY) =
    trainTestSplit(df, "quality", 0.8)

In [17]:
fun <T> DataFrame<T>.toX(): Array<FloatArray> =
    merge { colsOf<Number>() }.by { it.map { it.toFloat() }.toFloatArray() }.into("X")
        .get { "X"<FloatArray>() }
        .toList()
        .toTypedArray()

In [18]:
fun <T> DataFrame<T>.toY(): FloatArray = 
    get { "quality"<Int>() }
        .asIterable()
        .map { it.toFloat() }
        .toFloatArray()

In [19]:
val trainXDL = trainX.toX()
val trainYDL = trainY.toY()
val testXDL = testX.toX()
val testYDL = testY.toY()

In [20]:
val trainKotlinDLDataset = OnHeapDataset.create({ trainXDL }, { trainYDL })
val testKotlinDLDataset = OnHeapDataset.create({ testXDL }, { testYDL })

In [21]:
val inputNeurons = train.x[0].size.toLong()

val model2 = Sequential.of(
    Input(
        inputNeurons
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = (inputNeurons * 10).toInt(),
        activation = Activations.Tanh,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    ),
    Dense(
        outputSize = 1,
        activation = Activations.Linear,
        kernelInitializer = HeNormal(),
        biasInitializer = HeNormal()
    )
)
model2.compile(optimizer = Adam(), loss = Losses.MSE, metric = Metrics.MAE)
model2.printSummary()

Model type: Sequential
______________________________________________________________________________
Layer (type)                           Output Shape              Param #      
input_1(Input)                         [None, 8]                 0            
______________________________________________________________________________
dense_2(Dense)                         [None, 80]                720          
______________________________________________________________________________
dense_3(Dense)                         [None, 80]                6480         
______________________________________________________________________________
dense_4(Dense)                         [None, 1]                 81           
______________________________________________________________________________
Total trainable params: 7281
Total frozen params: 0
Total params: 7281
______________________________________________________________________________


In [22]:
val trainHist = model2.fit(train, batchSize = 500, epochs = 2000)
trainHist.epochHistory.toDataFrame().tail()

epochIndex,lossValue,metricValues,valLossValue,valMetricValues
1996,0.334773,[0.45107388496398926],,[NaN]
1997,0.334737,[0.45104312896728516],,[NaN]
1998,0.3347,[0.45101237297058105],,[NaN]
1999,0.334663,[0.4509815275669098],,[NaN]
2000,0.334626,[0.45095062255859375],,[NaN]


In [23]:
model2.predictSoftly(testXDL[9])[0]

6.6911993

In [24]:
testYDL[9]

7.0

We can also compare predicted and ground truth values to ensure predictions are correct

In [25]:
val predicted = testXDL.mapIndexed { i, _ ->
    round(model2.predictSoftly(testXDL[i])[0]).toInt()
}.toColumn("predicted")

val ground_truth = testYDL.mapIndexed { i, _ ->
    testYDL[i].toInt()
}.toColumn("ground_truth")

val predDf = dataFrameOf(predicted, ground_truth)

In [26]:
predDf.head()

predicted,ground_truth
5,5
5,5
6,5
5,5
6,6


In [27]:
val inds = List(10) { it + 1 }
val ctab = predDf
    .groupBy { ground_truth }.pivotCounts(inward = false) { predicted }
    .sortBy { ground_truth }

ctab.format { drop(1) }.perRowCol { row, col ->
    val y = col.name().toInt()
    val x = row.ground_truth
    val k = 1.0 - abs(x - y) / 10.0
    background(RGBColor(50, (50 + k * 200).toInt().toShort(), 50))
}

ground_truth,5,6,4,7
3,2,0,1,0
4,8,2,1,0
5,105,42,1,0
6,34,78,0,5
7,0,20,0,16
8,0,1,0,3


In [28]:
val predDf2 = predDf.add("avg_dev") { abs(predicted - ground_truth) }

In [29]:
predDf2.avg_dev.cast<Double>().describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
avg_dev,Int,319,3,0,0,200,0.388715,0.519432,0,0,2


In [30]:
predDf2.sortBy { avg_dev }[(0.7 * (319 - 1)).toInt()]

predicted,ground_truth,avg_dev
6,5,1


In [31]:
model2.close()