In [1]:
%use dataframe(0.14.0-RC1)

## Overview of new features in 0.14.0 dev version 
DataFrame.castTo(df)

DataFrame.readExcel: new `stringColumns` parameter

DataFrame.generateDataClasses()

DataRowSchema API

Iterable<T>.toDataFrame(columnName)

## `castTo` helps to reuse code in notebooks
Let's say you have a few dataframes with the same schema. You often need to run the same code on each of them,
for example, to draw a plot for different periods of time, or to process and print information from different sources.
 

In [2]:
// Single-row sample dataframe parsed from an inline CSV string.
// It is used as the schema reference passed to `castTo` in the cells below.
val sample = DataFrame.readDelimStr("""full_name,html_url,stargazers_count,topics,watchers
JetBrains/JPS,https://github.com/JetBrains/JPS,23,"[build-system]",23
""")
// Last expression of the cell: renders `sample` as the cell output.
sample

full_name,html_url,stargazers_count,topics,watchers
JetBrains/JPS,https://github.com/JetBrains/JPS,23,[build-system],23


In [3]:
// Top 10 rows by `stargazers_count` for any dataframe shaped like `sample`.
// `castTo(sample)` throws an exception if the schema of `this` doesn't match the schema of `sample`.
fun AnyFrame.top10() =
    castTo(sample)
        .sortByDesc { stargazers_count }
        .take(10)

In [4]:
// Load the JetBrains repositories CSV; the URL is pinned to a specific commit of Kotlin/dataframe.
val repos = DataFrame.read("https://raw.githubusercontent.com/Kotlin/dataframe/f72655be9a6235eefa183de22f4e1c94ac539f02/data/jetbrains_repositories.csv")
// `repos` was not created from `sample`, but it has the same columns, so `top10()` accepts it.
repos.top10()

full_name,html_url,stargazers_count,topics,watchers
JetBrains/kotlin,https://github.com/JetBrains/kotlin,39402,"[compiler, gradle-plugin, intellij-pl...",39402
JetBrains/intellij-community,https://github.com/JetBrains/intellij...,12926,"[code-editor, ide, intellij, intellij...",12926
JetBrains/kotlin-native,https://github.com/JetBrains/kotlin-n...,7101,"[c, compiler, kotlin, llvm, objective-c]",7101
JetBrains/compose-jb,https://github.com/JetBrains/compose-jb,6805,"[android, awt, compose, declarative-u...",6805
JetBrains/ideavim,https://github.com/JetBrains/ideavim,6120,"[ideavim, intellij, intellij-platform...",6120
JetBrains/JetBrainsMono,https://github.com/JetBrains/JetBrain...,6059,"[coding-font, font, ligatures, monosp...",6059
JetBrains/Exposed,https://github.com/JetBrains/Exposed,5688,"[dao, kotlin, orm, sql]",5688
JetBrains/ring-ui,https://github.com/JetBrains/ring-ui,2836,"[components, jetbrains-ui, react]",2836
JetBrains/kotlinconf-app,https://github.com/JetBrains/kotlinco...,2628,[],2628
JetBrains/create-react-kotlin-app,https://github.com/JetBrains/create-r...,2424,"[create-react-app, jetbrains-ui, kotl...",2424


In [5]:
// Pass verify = false when the frame may differ from `sample` in columns this function doesn't use.
fun AnyFrame.top10_noVerify() =
    castTo(sample, verify = false)
        .sortByDesc { stargazers_count }
        .take(10)

// `watchers` is replaced with nulls here; the call still works because only `stargazers_count` is read.
repos
    .update { watchers }
    .withNull()
    .top10_noVerify()

full_name,html_url,stargazers_count,topics,watchers
JetBrains/kotlin,https://github.com/JetBrains/kotlin,39402,"[compiler, gradle-plugin, intellij-pl...",
JetBrains/intellij-community,https://github.com/JetBrains/intellij...,12926,"[code-editor, ide, intellij, intellij...",
JetBrains/kotlin-native,https://github.com/JetBrains/kotlin-n...,7101,"[c, compiler, kotlin, llvm, objective-c]",
JetBrains/compose-jb,https://github.com/JetBrains/compose-jb,6805,"[android, awt, compose, declarative-u...",
JetBrains/ideavim,https://github.com/JetBrains/ideavim,6120,"[ideavim, intellij, intellij-platform...",
JetBrains/JetBrainsMono,https://github.com/JetBrains/JetBrain...,6059,"[coding-font, font, ligatures, monosp...",
JetBrains/Exposed,https://github.com/JetBrains/Exposed,5688,"[dao, kotlin, orm, sql]",
JetBrains/ring-ui,https://github.com/JetBrains/ring-ui,2836,"[components, jetbrains-ui, react]",
JetBrains/kotlinconf-app,https://github.com/JetBrains/kotlinco...,2628,[],
JetBrains/create-react-kotlin-app,https://github.com/JetBrains/create-r...,2424,"[create-react-app, jetbrains-ui, kotl...",


In [6]:
// With verify = false, a type mismatch in a column the function actually reads
// shows up later as a NullPointerException or ClassCastException.
fun AnyFrame.filter_noVerify() =
    castTo(sample, verify = false)
        .filter { watchers > 10 }
        .take(10)

// An exception is expected here: `watchers` holds nulls, but the filter reads it as a non-null Int.
repos
    .update { watchers }
    .withNull()
    .filter_noVerify()

java.lang.NullPointerException: null cannot be cast to non-null type kotlin.Int

## generateDataClasses
[Issue 344](https://github.com/Kotlin/dataframe/issues/344) 

[PR 763](https://github.com/Kotlin/dataframe/pull/763)

This new function generates data class declarations that match a DataFrame's schema, including FrameColumn and ColumnGroup columns.

In [7]:
// Bucket repositories by thousands of stars (integer division), then turn the grouping
// into a dataframe with the key column `thousands` and a nested `group` column.
val repos1 = repos
    .groupBy { expr("thousands") { stargazers_count / 1000 } }
    .toDataFrame()
repos1.schema()

thousands: Int
group: *
    full_name: String
    html_url: URL
    stargazers_count: Int
    topics: String
    watchers: Int


In [8]:
// Generate @DataSchema data class declarations for the schema of `repos1`.
// `markerName` is the name of the top-level class; the class for the nested `group` column gets a numeric suffix.
repos1.generateDataClasses(markerName = "RepositoriesData")

@DataSchema
data class RepositoriesData1(
    @ColumnName("full_name")
    val fullName: String,
    @ColumnName("html_url")
    val htmlUrl: java.net.URL,
    @ColumnName("stargazers_count")
    val stargazersCount: Int,
    val topics: String,
    val watchers: Int
)

@DataSchema
data class RepositoriesData(
    val group: List<RepositoriesData1>,
    val thousands: Int
)

In [9]:
// There's no need to copy-paste the generated code into the notebook:
// EXECUTE runs the generated declarations, so the classes can be used in the following cells.
EXECUTE(repos1.generateDataClasses(markerName = "RepositoriesData").value)

A dataframe can then be converted to these data classes

In [10]:
// Convert every row of `repos1` into a `RepositoriesData` instance, then print each group's key.
val repositoriesDataList = repos1.toListOf<RepositoriesData>()
for (repositoriesData in repositoriesDataList) {
    println(repositoriesData.thousands)
}

0
6
1
12
39
5
7
2


A list of data class instances can be converted to DataFrame too

In [11]:
// NOTE: despite its name, `dataframe` holds a single `RepositoriesData1` instance, not a DataFrame.
val dataframe = RepositoriesData1(
    fullName = "Kotlin/dataframe",
    htmlUrl = URL("https://github.com/Kotlin/dataframe"),
    stargazersCount = 773,
    topics = "[kotlin, data-science, data-analysis, dataframe]",
    watchers = 16,
)
// Wrap the instance into the outer data class and convert the resulting list to a DataFrame.
val repos2 = listOf(RepositoriesData(group = listOf(dataframe), thousands = 0)).toDataFrame()
repos2

group,thousands,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
topics,watchers,full_name,html_url,stargazers_count
"DataFrame [1 x 5]topicswatchersfull_namehtml_urlstargazers_count[kotlin, data-science, data-analysis,...16Kotlin/dataframehttps://github.com/Kotlin/dataframe773",0,,,
topics,watchers,full_name,html_url,stargazers_count
"[kotlin, data-science, data-analysis,...",16,Kotlin/dataframe,https://github.com/Kotlin/dataframe,773

topics,watchers,full_name,html_url,stargazers_count
"[kotlin, data-science, data-analysis,...",16,Kotlin/dataframe,https://github.com/Kotlin/dataframe,773


In [12]:
// Check that the dataframe built from data class instances has the same schema
// as the dataframe the classes were generated from.
repos2.schema() == repos1.schema()

true

## DataRowSchema API
[Issue 113](https://github.com/Kotlin/dataframe/issues/113)

We introduce a marker interface that data schema classes can implement. It enables two new APIs: `dataFrameOf` and `append`.
With the [compiler plugin](https://github.com/Kotlin/dataframe/issues/704), this interface will be added automatically to any declaration annotated with `@DataSchema`. For now, let's see what it does:

In [13]:
/**
 * Row schema of a single repository.
 * `@ColumnName` maps a camelCase property to the snake_case column it is read from.
 */
@DataSchema
data class RepositoriesData1(
    @ColumnName("full_name") val fullName: String,
    @ColumnName("html_url") val htmlUrl: java.net.URL,
    @ColumnName("stargazers_count") val stargazersCount: Int,
    val topics: String,
    val watchers: Int,
)

/**
 * Row schema of the grouped dataframe: the `thousands` key and the repositories that fall into it (`group`).
 * Implementing [DataRowSchema] is what makes instances usable with `dataFrameOf` and the typed `append`.
 */
@DataSchema
data class RepositoriesData(
    val group: List<RepositoriesData1>,
    val thousands: Int
) : DataRowSchema // <<-- New marker interface

In [14]:
val element = RepositoriesData1(
    fullName = "Kotlin/dataframe",
    htmlUrl = URL("https://github.com/Kotlin/dataframe"),
    stargazersCount = 773,
    topics = "[kotlin, data-science, data-analysis, dataframe]",
    watchers = 16,
)
// shorter compared to listOf().toDataFrame() above
val df = dataFrameOf(RepositoriesData(group = listOf(element), thousands = 0))
// new typed flavor of append
df.append(RepositoriesData(group = listOf(element), thousands = 0))

group,thousands,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
topics,watchers,full_name,html_url,stargazers_count
topics,watchers,full_name,html_url,stargazers_count
"DataFrame [1 x 5]topicswatchersfull_namehtml_urlstargazers_count[kotlin, data-science, data-analysis,...16Kotlin/dataframehttps://github.com/Kotlin/dataframe773",0,,,
topics,watchers,full_name,html_url,stargazers_count
"[kotlin, data-science, data-analysis,...",16,Kotlin/dataframe,https://github.com/Kotlin/dataframe,773
"DataFrame [1 x 5]topicswatchersfull_namehtml_urlstargazers_count[kotlin, data-science, data-analysis,...16Kotlin/dataframehttps://github.com/Kotlin/dataframe773",0,,,
topics,watchers,full_name,html_url,stargazers_count
"[kotlin, data-science, data-analysis,...",16,Kotlin/dataframe,https://github.com/Kotlin/dataframe,773

topics,watchers,full_name,html_url,stargazers_count
"[kotlin, data-science, data-analysis,...",16,Kotlin/dataframe,https://github.com/Kotlin/dataframe,773

topics,watchers,full_name,html_url,stargazers_count
"[kotlin, data-science, data-analysis,...",16,Kotlin/dataframe,https://github.com/Kotlin/dataframe,773


## DataFrame.readExcel(..., stringColumns) parameter
[Issue 150](https://github.com/Kotlin/dataframe/issues/150)

It gives you more control over how cell values are interpreted. See the example below.

In [15]:
// By default, readExcel relies on cell value types:
// in this file the numeric cell is read as a Double and the text cells as Strings.
val excel1 = DataFrame.readExcel("mixed_column.xlsx")
excel1

2024-09-17T13:59:21.670459871Z Execution of code '// By default readEx...' ERROR Log4j2 could not find a logging implementation. Please add log4j-core to the classpath. Using SimpleLogger to log to the console...


col1
100.000000
A100
B100
C100


In [16]:
// col1 mixes Double and String values, so its column type is reported as Comparable<*>
excel1.schema()

col1: Comparable<*>

In [17]:
// Read the column selected by StringColumns("A") as strings, regardless of the cell value types.
// NOTE(review): "A" is presumably the Excel column letter; confirm against the readExcel documentation.
val excel2 = DataFrame.readExcel("mixed_column.xlsx", stringColumns = StringColumns("A"))
excel2

col1
100
A100
B100
C100


In [18]:
// With stringColumns applied, col1 is a plain String column
excel2.schema()

col1: String

## Iterable<T>.toDataFrame(columnName)
This is an easy way to create a DataFrame when you have a list of Files, URLs, or any other structure
you want to extract data from.
In a notebook, it can be convenient to start from a single column of these values, so that you can see
the number of rows and the `toString` of each value in a table,
and then iteratively add columns with the parts of the data you're interested in.
That could be a File's content, a specific section of an HTML document, some metadata, etc.

In [19]:
import kotlin.io.path.Path
import kotlin.io.path.listDirectoryEntries

// List the CSV files in the current directory and put them into a single-column dataframe.
// Directory entries are returned in no guaranteed order, so the paths are sorted
// to keep the row order (and the rendered output) reproducible across runs and file systems.
val csvs = Path(".")
    .listDirectoryEntries("*.csv")
    .sorted()
    .toDataFrame(columnName = "file")
csvs

file
./data.csv
./data1.csv


In [20]:
import kotlin.io.path.fileSize

// Add two computed columns for every listed file.
csvs.add { 
    // file size in bytes, as returned by Path.fileSize()
    "size" from { file.fileSize() }
    // the file parsed into its own DataFrame, stored as a nested frame in the `data` column
    "data" from { file.toFile().readDataFrame() }
}

file,size,data
a,b,c
col1,col2,col3
./data.csv,12,DataFrame [1 x 3]abc123
a,b,c
1,2,3
./data1.csv,27,DataFrame [1 x 3]col1col2col33.000000str1
col1,col2,col3
3.000000,str,1

a,b,c
1,2,3

col1,col2,col3
3.0,str,1
