# Compare Benchmarks with each other
This notebook demonstrates how you can analyze the results of two benchmark functions: one measuring the performance of a baseline implementation, and another measuring the performance of the proposed optimized implementation.

Such an approach could be handy when it is possible to use two alternative implementations simultaneously (which is usually the case).

The notebook assumes that benchmark functions for the baseline implementation have the "Baseline" suffix in their names, and that benchmark functions for the optimized (or changed) alternative implementation have the "Optimized" suffix in their names. For example, `invSqrtBaseline` and `invSqrtOptimized`.

While this example uses a JVM-only project, the notebook could be applied to results collected from multiplatform benchmarks as well.

First, you need to run benchmarks. This can be done by running the following command from the root of the project:

```shell
> ./gradlew :examples:kotlin-jvm-compare-hypothesis:benchmark
```

Once it is completed, run this notebook, and it will automatically find the latest result.

In [1]:
%use serialization, dataframe, kandy

In [2]:
// Serialization classes matching the JMH-like JSON format.
// We define these classes manually so that `params` can be kept as a JsonObject, which lets us handle
// parameters in a generic manner. If your benchmarks have fixed params, using
// `"<jsonText>".deserializeThis()` is faster and easier.

/**
 * One entry of the benchmark result JSON.
 *
 * [params] is kept as a raw [JsonObject] so that parameters can be handled generically, whatever their
 * names are; it defaults to `null` when the entry has no `params` key.
 * [secondaryMetrics] values share the shape of [primaryMetric].
 */
@Serializable
data class Benchmark(
    val benchmark: String,
    val mode: String,
    val warmupIterations: Int,
    val warmupTime: String,
    val measurementIterations: Int,
    val measurementTime: String,
    val primaryMetric: PrimaryMetric,
    val secondaryMetrics: Map<String, PrimaryMetric>,
    val params: JsonObject? = null,
)

/**
 * A single metric of a benchmark result entry: the [score] with its [scoreError] and [scoreUnit],
 * plus the confidence interval, percentiles and raw measurements as they appear in the JSON.
 */
@Serializable
data class PrimaryMetric(
    val score: Double,
    val scoreError: Double,
    val scoreConfidence: List<Double>,
    val scorePercentiles: Map<String, Double>,
    val scoreUnit: String,
    val rawData: List<List<Double>>,
)

In [3]:
// Naming convention used to pair benchmarks: the benchmark for the "baseline" implementation
// ends with "Baseline", and the benchmark for the "optimized" implementation ends with "Optimized".
val baselineSuffix: String = "Baseline"
val optimizedSuffix: String = "Optimized"

In [4]:
import kotlinx.serialization.json.Json
import java.nio.file.Files
import java.nio.file.attribute.BasicFileAttributes
import kotlin.io.path.exists
import kotlin.io.path.forEachDirectoryEntry
import kotlin.io.path.isDirectory
import kotlin.io.path.listDirectoryEntries
import kotlin.io.path.readText

// Find the latest result file and decode it.
// Every sub-directory of `runsDir` is treated as one benchmark run; the one with the most recent
// creation time wins.
val runsDir = notebook.workingDir.resolve("kotlin-jvm-compare-hypothesis/build/reports/benchmarks/main")
// Fail with an actionable message instead of a bare NoSuchFileException when nothing has been run yet.
check(runsDir.isDirectory()) {
    "Benchmark reports directory not found: $runsDir. Run the benchmarks before executing this notebook."
}
val lastRunDir = runsDir.listDirectoryEntries()
    .filter { it.isDirectory() }
    .maxByOrNull { dir -> Files.readAttributes(dir, BasicFileAttributes::class.java).creationTime() }
    ?: error("No benchmark runs found in $runsDir. Run the benchmarks before executing this notebook.")
val outputFile = lastRunDir.resolve("main.json")
check(outputFile.exists()) { "Result file is missing from the latest run: $outputFile" }
// Unknown keys are ignored because the serialization classes only declare a subset of the JSON fields.
val json = Json { ignoreUnknownKeys = true }
val benchmarkData = json.decodeFromString<List<Benchmark>>(outputFile.readText())

In [5]:
import kotlinx.serialization.json.*

/**
 * Flattened view of a single benchmark result, holding only what the comparison needs.
 * Its properties become the data frame columns (`name`, `params`, `score`, `error`, `unit`).
 *
 * NOTE(review): this class shares its name with the @Serializable `Benchmark` declared in an earlier
 * cell; presumably it shadows that class in cells executed afterwards, so re-running the JSON-loading
 * cell after this one may not work as expected. Confirm before renaming.
 */
data class Benchmark(
    val name: String,
    val params: String,
    val score: Double,
    val error: Double,
    val unit: String,
)

// Split benchmark results into groups, one per baseline/optimized pair.
// The group key is the benchmark name with its "Baseline"/"Optimized" suffix removed, so a group holds
// both implementations of one benchmark together with all of their parameter variants.
val benchmarkGroups = benchmarkData
    .groupBy {
        if (it.benchmark.endsWith(optimizedSuffix))
            it.benchmark.removeSuffix(optimizedSuffix)
        else
            it.benchmark.removeSuffix(baselineSuffix)
    }
    .mapValues { group ->
        val benchmarks = group.value.map { benchmark ->
            // Parameters are specific to each benchmark, so they are read generically from the raw JSON
            // object and flattened into a "key=value,key=value" string. Keys are sorted so that the
            // baseline and optimized variants produce identical strings, which are used as the join key.
            val paramInfo = benchmark.params?.entries.orEmpty()
                .sortedBy { it.key }
                .joinToString(",") { "${it.key}=${it.value.jsonPrimitive.content}" }
            Benchmark(
                name = benchmark.benchmark,
                params = paramInfo,
                score = benchmark.primaryMetric.score,
                error = benchmark.primaryMetric.scoreError,
                unit = benchmark.primaryMetric.scoreUnit
            )
        }
        // Use the same suffix constants as the grouping key above, so changing them keeps both in sync.
        val baseline = benchmarks.filter { it.name.endsWith(baselineSuffix) }.toDataFrame()
        val optimized = benchmarks.filter { it.name.endsWith(optimizedSuffix) }.toDataFrame()
        // Pair rows by parameter set. The later cells read the optimized side's columns under a "1"
        // suffix (`score1`, `error1`).
        baseline.join(optimized, "params")
    }

// Uncomment this to see the benchmark data as DataFrames
// benchmarkGroups.forEach {
//     DISPLAY(it.value)
// }

In [6]:
// Turn every joined baseline/optimized frame into a frame that is ready for plotting:
// - `errorMin`/`errorMax` (baseline) and `errorMin1`/`errorMax1` (optimized) hold score -/+ error
// - `diff` is the change of the optimized score relative to the baseline score, in percent
// - `label` is the parameter list (one parameter per line) for tests with parameters,
//   and the bare test name for tests without parameters
// - `barColor` is "grey" when the two error intervals overlap, otherwise "green" for a positive
//   `diff` and "red" for anything else
// NOTE(review): "green" for a positive diff presumably assumes that a higher score is better — confirm
// for the benchmark mode in use.
val plotData = benchmarkGroups.mapValues { (_, joined) ->
    joined
        .add("errorMin") { row -> row.getValue<Double>("score") - row.getValue<Double>("error") }
        .add("errorMax") { row -> row.getValue<Double>("score") + row.getValue<Double>("error") }
        .add("errorMin1") { row -> row.getValue<Double>("score1") - row.getValue<Double>("error1") }
        .add("errorMax1") { row -> row.getValue<Double>("score1") + row.getValue<Double>("error1") }
        .add("diff") { row ->
            val baselineScore = row.getValue<Double>("score")
            val optimizedScore = row.getValue<Double>("score1")
            (optimizedScore - baselineScore) / baselineScore * 100.0
        }
        .insert("label") { row ->
            // Re-format the benchmark labels to make them look "nicer"
            val params = row.getValue<String>("params")
            if (params.isNotBlank()) {
                params.replace(",", "\n")
            } else {
                row.getValue<String>("name").substringAfterLast(".").removeSuffix(baselineSuffix)
            }
        }.at(0)
        .add("barColor") { row ->
            val diff = row.getValue<Double>("diff")
            val baselineLow = row.getValue<Double>("errorMin")
            val baselineHigh = row.getValue<Double>("errorMax")
            val optimizedLow = row.getValue<Double>("errorMin1")
            val optimizedHigh = row.getValue<Double>("errorMax1")
            // Two closed intervals intersect when each one starts no later than the other one ends.
            val intervalsOverlap = baselineLow <= optimizedHigh && optimizedLow <= baselineHigh
            when {
                intervalsOverlap -> "grey"
                diff > 0 -> "green"
                else -> "red"
            }
        }
        .remove("name", "params")
}

In [7]:
import org.jetbrains.letsPlot.Geom
import org.jetbrains.letsPlot.core.spec.plotson.coord
import org.jetbrains.letsPlot.themes.margin

// Render one chart per benchmark group: a horizontal bar per row showing the relative difference
// (`diff`, in percent), filled according to the row's `barColor` value.
// All rows of a group share one axis, so this works best when their differences are of a similar
// magnitude. If they are not, some plots might look very squished; in that case you can experiment
// with a LOG10 scale or with explicit axis limits to focus on the changes.
plotData.forEach { (groupName, frame) ->
    // Grow the plot with the number of rows so that the bars keep a readable thickness.
    val plotHeight = (50 * frame.size().nrow) + 100
    val chart = frame.plot {
        bars {
            x("label") { axis.name = "" }
            y("diff")
            fillColor("barColor") {
                scale = categorical("red" to Color.RED, "green" to Color.GREEN, "grey" to Color.GREY)
                legend.type = LegendType.None
            }
        }
        // Flip the axes so that the bars run horizontally and the labels are listed vertically.
        coordinatesTransformation = CoordinatesTransformation.cartesianFlipped()
        layout {
            this.yAxisLabel = "Diff, %"
            style {
                global {
                    // NOTE(review): the result of this call is not assigned to anything here —
                    // confirm that it actually changes the title margin.
                    title { margin(10.0, -10.0) }
                    text { fontFamily = FontFamily.MONO }
                }
            }
            size = 800 to plotHeight
        }
    }
    DISPLAY(HTML("<h4 >$groupName</h4>"))
    DISPLAY(chart)
}