# Compare Benchmark Runs
This notebook demonstrates how to analyze the differences between two runs of the same benchmark and find the tests that differ the most. Those tests are the ones most likely to need further analysis to figure out why they changed.

Several projects exist in the `examples` folder; this notebook assumes we are working on the
JVM part of the `kotlin-multiplatform` project, but the same approach can be used for the other projects.

First, you need to run the benchmark twice. This can be done by running these commands from the root of the project:

```shell
> ./gradlew :examples:kotlin-multiplatform:jvmBenchmark
> ./gradlew :examples:kotlin-multiplatform:jvmBenchmark
```

Once both runs have completed, run this notebook, and it will automatically find the two latest results.

In [1]:
%use serialization, dataframe, kandy

In [2]:
// Serialization classes matching the JMH-like JSON format.
// We define these classes manually so we can keep `params` as a JsonObject, which means we can handle them
// in a generic manner. If your benchmarks have fixed params, using `"<jsonText>".deserializeThis()` is
// faster and easier.

/**
 * One entry of the benchmark report JSON.
 *
 * [params] is kept as a raw [JsonObject] so parameter names do not have to be known up front;
 * it defaults to `null` when an entry carries no `params` key.
 */
@Serializable
data class Benchmark(
    val benchmark: String,
    val mode: String,
    val warmupIterations: Int,
    val warmupTime: String,
    val measurementIterations: Int,
    val measurementTime: String,
    val primaryMetric: PrimaryMetric,
    val secondaryMetrics: Map<String, PrimaryMetric>,
    val params: JsonObject? = null,
)

/**
 * Score block of a benchmark entry; used for the primary metric and for every secondary metric.
 *
 * [scoreConfidence] is read elsewhere in this notebook as a two-element list whose first and
 * second values are the bounds of the score range.
 */
@Serializable
data class PrimaryMetric(
    val score: Double,
    val scoreError: Double,
    val scoreConfidence: List<Double>,
    val scorePercentiles: Map<String, Double>,
    val scoreUnit: String,
    val rawData: List<List<Double>>,
)

In [3]:
import java.nio.file.Files
import java.nio.file.attribute.BasicFileAttributes
import kotlin.io.path.exists
import kotlin.io.path.forEachDirectoryEntry
import kotlin.io.path.isDirectory
import kotlin.io.path.listDirectoryEntries
import kotlin.io.path.readText

// Find the two most recent result files, based on the creation time of their run directories.
val runsDir = notebook.workingDir.resolve("kotlin-multiplatform/build/reports/benchmarks/main")
// Fail early with an actionable message instead of a NoSuchFileException from the directory listing.
check(runsDir.exists()) {
    "Benchmark report directory not found: $runsDir. Run the jvmBenchmark task twice before running this notebook."
}
val outputFiles = runsDir.listDirectoryEntries()
    .filter { it.isDirectory() }
    // Newest run first, so `first()` is the new run and `last()` is the old run.
    .sortedByDescending { dir -> Files.readAttributes(dir, BasicFileAttributes::class.java).creationTime() }
    .also { runDirs ->
        // A comparison needs two runs; without this check the failure would be a bare IndexOutOfBoundsException.
        check(runDirs.size >= 2) {
            "Expected at least two benchmark runs in $runsDir but found ${runDirs.size}. Run the jvmBenchmark task twice."
        }
    }
    .take(2)
    .map { it.resolve("jvm.json") }

In [4]:
// Decode both report files into typed benchmark lists. JSON keys that the serialization
// classes do not model are ignored.
val json = Json { ignoreUnknownKeys = true }
val newRun: List<Benchmark> = json.decodeFromString(outputFiles.first().readText())
val oldRun: List<Benchmark> = json.decodeFromString(outputFiles.last().readText())

In [5]:
// Convert to DataFrames for easier processing. As there is no "id" key for the benchmarks, we invent one by just
// assigning the test row index as their "primary key". We could attempt to use the benchmark name and param values,
// but that is complicated by how parameters are represented in the JSON file. So, since we assume that the two files
// contain the same benchmarks in the same order, using the row index should be safe.
val oldDf = oldRun.toDataFrame().addId("rowIndex")
val newDf = newRun.toDataFrame().addId("rowIndex")

In [6]:
// Pair up old and new results by row index. Columns coming from `newDf` whose names clash with
// `oldDf` are referred to with a `1` suffix in the cells below (e.g. `primaryMetric1`).
val combinedData = oldDf.innerJoin(newDf) { rowIndex }
// Uncomment this to see the intermediate dataframe:
// combinedData

In [7]:
import kotlinx.serialization.json.encodeToJsonElement

// Reduce the combined data into the exact format we need: one row per benchmark with its name,
// a printable parameter string, the mode, the unit, and a score/range pair for each run.
val resultData = combinedData.mapToFrame {
    "name" from { it.benchmark }
    // Render the params as "key=value,key=value", sorted by key; empty when there are no params.
    "params" from {
        it.params?.entries.orEmpty()
            .sortedBy { it.key }
            .joinToString(",") { entry -> "${entry.key}=${entry.value.jsonPrimitive.content}" }
    }
    "mode" from { it.mode } // "avgt" or "thrpt"
    "unit" from { it.primaryMetric.scoreUnit }
    // Old run: the un-suffixed `primaryMetric` column.
    "runOld" {
        "score" from { it.primaryMetric.score }
        "range" from { it.primaryMetric.scoreConfidence[0]..it.primaryMetric.scoreConfidence[1] }
    }
    // New run: the `primaryMetric1` column produced by the join.
    "runNew" {
        "score" from { it.primaryMetric1.score }
        "range" from { it.primaryMetric1.scoreConfidence[0]..it.primaryMetric1.scoreConfidence[1] }
    }
}
// Uncomment this to see the intermediate dataframe:
// resultData

In [8]:
// Flatten the data so it is easier to plot. The cells below read the old run's values as
// `score`/`range` and the new run's values as `score1`/`range1`.
val mergedData = resultData.unfold { runOld and runNew }.flatten()
// Uncomment this to see the intermediate dataframe:
// mergedData

In [9]:
// Before plotting the data, we calculate the change between the two runs. This is saved
// in "diffScore". This is done slightly differently depending on the test mode:
//
// - "avgt": For the average time we use "oldScore - newScore", so improvements in the
//   benchmark result in positive numbers.
// - "thrpt": For throughput, we use "newScore - oldScore", so improvements here also
//   result in positive numbers.
//
// We also normalize this value as a percentage change from the old score (`score`). This is
// saved in "diffScorePercentage".
//
// Here `score` is the old run's score and `score1` is the new run's score.
val plotData = mergedData
    .add("diffScore") {
        when (mode) {
            "avgt" -> score - score1
            "thrpt" -> score1 - score
            else -> error("Unknown mode: $mode")
        }
    }
    .add("diffScorePercentage") {
        (get("diffScore") as Double) * 100.0 / score
    }
    // Axis label: the benchmark name, plus the parameter string on a second line when there is one.
    .add("testLabel") {
        if (params.isNullOrBlank()) {
            name
        } else {
            "$name\n[$params]"
        }
    }
    // Category used to color the bars: "neg" for a negative change, "pos" otherwise.
    .add("barColor") {
        val value = get("diffScorePercentage") as Double
        if (value < 0.0) "neg" else "pos"
    }
plotData

name,params,mode,unit,score,range,score1,range1,diffScore,diffScorePercentage,testLabel,barColor
test.InheritedBenchmark.baseBenchmark,,thrpt,ops/s,999715.573058,974954.2809652109..1024476.8651513313,1140852.957229,1106659.7386652725..1175046.1757937148,141137.384171,14.117754,test.InheritedBenchmark.baseBenchmark,pos
test.InheritedBenchmark.inheritedBenc...,,thrpt,ops/s,132519854.002753,1.2936737342079675E8..1.3567233458470...,146792560.895807,1.4482756334251586E8..1.4875755844909...,14272706.893054,10.77024,test.InheritedBenchmark.inheritedBenc...,pos
test.ParamBenchmark.mathBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,196902.036134,191654.7423267418..202149.32994077116,217133.491804,215027.16316066374..219239.82044761901,20231.45567,10.274884,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.mathBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,198982.020454,194497.3197739599..203466.72113419612,216593.775142,212594.8732753121..220592.677009433,17611.754688,8.850928,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.mathBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,76487.508628,74595.04831374026..78379.96894260854,86523.588517,82315.21514352418..90731.96188974172,10036.079888,13.121201,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.mathBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,77218.536895,75599.16372724227..78837.91006301393,85944.381961,81177.07416021802..90711.68976101496,8725.845065,11.300195,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.otherBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,2312642.773644,2091779.438724716..2533506.1085633924,2616589.19879,2579570.7359826625..2653607.6615965427,303946.425146,13.142818,test.ParamBenchmark.otherBenchmark [d...,pos
test.ParamBenchmark.otherBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,2295922.938822,2089935.2366487498..2501910.6409945134,2629054.441671,2524752.102688715..2733356.780654085,333131.50285,14.509699,test.ParamBenchmark.otherBenchmark [d...,pos
test.ParamBenchmark.otherBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,2295223.364007,2115768.594299648..2474678.1337146433,2646402.737052,2616649.4130033795..2676156.0611009924,351179.373045,15.300444,test.ParamBenchmark.otherBenchmark [d...,pos
test.ParamBenchmark.otherBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,2295128.593211,2082445.0002591067..2507812.186162458,2652047.742142,2627344.681389795..2676750.802894536,356919.148931,15.551161,test.ParamBenchmark.otherBenchmark [d...,pos


In [10]:
import org.jetbrains.kotlinx.kandy.util.color.Color
import org.jetbrains.letsPlot.core.spec.plotson.fill
import org.jetbrains.letsPlot.label.ggtitle
import org.jetbrains.letsPlot.scale.guideLegend
import org.jetbrains.letsPlot.scale.guides

// Now we can plot this data. First we create a basic plot just showing the difference in percent between all scores.
// Rows are sorted by the percentage change before plotting.
plotData.sortBy { diffScorePercentage }.plot {
    // One horizontal bar per benchmark: percentage change on x, test label on y.
    barsH {
        x(diffScorePercentage) {
            axis.name = "Diff %"
        }
        y(testLabel) {
            axis.name = ""
        }
        // Color by the sign category computed in `barColor`; the legend is hidden.
        fillColor(barColor) {
            scale = categorical("neg" to Color.RED, "pos" to Color.GREEN)
            legend.type = LegendType.None
        }
        tooltips {
            line(diffScorePercentage, format = ".2f")
        }
    }
    layout {
        // The height grows with the number of rows: 40 per row plus a fixed 100.
        size = 800 to ((40 * plotData.size().nrow) + 100)
        style {
            global {
                title {
                    margin(10.0, 0.0)
                }
            }
        }
    }
}

In [11]:
// Just comparing the score values is a bit simplistic as the benchmark results are actually a range: score +/- error.
// So, instead of plotting all tests, we want to focus only on the benchmarks that look "interesting". This is
// defined as any benchmark that differs so much that the benchmark ranges do not overlap, i.e., we no longer
// look at only the score but consider the full error range.
//
// We still use the "score" to calculate the change in percent, but now on a filtered list.
/**
 * Returns `true` when this range and [other] share at least one value, i.e. each range
 * starts no later than the other one ends. Ranges that merely touch at an endpoint overlap.
 */
fun ClosedFloatingPointRange<Double>.overlaps(other: ClosedFloatingPointRange<Double>): Boolean {
    val startsBeforeOtherEnds = start <= other.endInclusive
    val otherStartsBeforeThisEnds = other.start <= endInclusive
    return startsBeforeOtherEnds && otherStartsBeforeThisEnds
}

// Keep only the rows where the old range (`range`) and the new range (`range1`) are disjoint.
val interestingBenchmarks = plotData.filter {
    !it.range.overlaps(it.range1)
}
// Display the filtered rows.
interestingBenchmarks

name,params,mode,unit,score,range,score1,range1,diffScore,diffScorePercentage,testLabel,barColor
test.InheritedBenchmark.baseBenchmark,,thrpt,ops/s,999715.573058,974954.2809652109..1024476.8651513313,1140852.957229,1106659.7386652725..1175046.1757937148,141137.384171,14.117754,test.InheritedBenchmark.baseBenchmark,pos
test.InheritedBenchmark.inheritedBenc...,,thrpt,ops/s,132519854.002753,1.2936737342079675E8..1.3567233458470...,146792560.895807,1.4482756334251586E8..1.4875755844909...,14272706.893054,10.77024,test.InheritedBenchmark.inheritedBenc...,pos
test.ParamBenchmark.mathBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,196902.036134,191654.7423267418..202149.32994077116,217133.491804,215027.16316066374..219239.82044761901,20231.45567,10.274884,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.mathBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,198982.020454,194497.3197739599..203466.72113419612,216593.775142,212594.8732753121..220592.677009433,17611.754688,8.850928,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.mathBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,76487.508628,74595.04831374026..78379.96894260854,86523.588517,82315.21514352418..90731.96188974172,10036.079888,13.121201,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.mathBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,77218.536895,75599.16372724227..78837.91006301393,85944.381961,81177.07416021802..90711.68976101496,8725.845065,11.300195,test.ParamBenchmark.mathBenchmark [da...,pos
test.ParamBenchmark.otherBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,2312642.773644,2091779.438724716..2533506.1085633924,2616589.19879,2579570.7359826625..2653607.6615965427,303946.425146,13.142818,test.ParamBenchmark.otherBenchmark [d...,pos
test.ParamBenchmark.otherBenchmark,"data=1,text=a ""string"" with quotes,va...",thrpt,ops/ms,2295922.938822,2089935.2366487498..2501910.6409945134,2629054.441671,2524752.102688715..2733356.780654085,333131.50285,14.509699,test.ParamBenchmark.otherBenchmark [d...,pos
test.ParamBenchmark.otherBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,2295223.364007,2115768.594299648..2474678.1337146433,2646402.737052,2616649.4130033795..2676156.0611009924,351179.373045,15.300444,test.ParamBenchmark.otherBenchmark [d...,pos
test.ParamBenchmark.otherBenchmark,"data=2,text=a ""string"" with quotes,va...",thrpt,ops/ms,2295128.593211,2082445.0002591067..2507812.186162458,2652047.742142,2627344.681389795..2676750.802894536,356919.148931,15.551161,test.ParamBenchmark.otherBenchmark [d...,pos


In [12]:
// Now let's plot the interesting benchmarks, similar to before.
// Rows are sorted by the percentage change before plotting.
interestingBenchmarks.sortBy { diffScorePercentage }.plot {
    // One horizontal bar per benchmark: percentage change on x, test label on y.
    barsH {
        x(diffScorePercentage) {
            axis.name = "Diff %"
        }
        y(testLabel) {
            axis.name = ""
        }
        // Color by the sign category computed in `barColor`; the legend is hidden.
        fillColor(barColor) {
            scale = categorical("neg" to Color.RED, "pos" to Color.GREEN)
            legend.type = LegendType.None
        }
        tooltips {
            line(diffScorePercentage, format = ".2f")
        }
    }
    layout {
        // The height grows with the number of filtered rows: 40 per row plus a fixed 100.
        size = 800 to ((40 * interestingBenchmarks.size().nrow) + 100)
        style {
            global {
                title {
                    margin(10.0, 0.0)
                }
            }
        }
    }
}