/- # DataFrame Implementation in Lean A formal implementation of DataFrame data structure for data visualization tools. The DataFrame is designed to be a foundation for plotting functions of type: f : DataFrame โ†’ ๐•‹ Mark ## Overview This implementation provides: - Type-safe cell values (Int, Float, String, Bool, None) - DataFrame operations (filter, select, map, aggregate) - CSV parsing capabilities - Formatted display functionality -/ /- ## CellValue Definition Represents the types that can be stored in DataFrame cells. This reduces the generality to basic types but can be extended by adding new types that implement Repr, BEq, and ToString. -/ inductive CellValue where | int : Int โ†’ CellValue | float : Float โ†’ CellValue | string : String โ†’ CellValue | bool : Bool โ†’ CellValue | none : CellValue deriving Repr, BEq -- ToString instance for CellValue instance : ToString CellValue where toString | CellValue.int i => toString i | CellValue.float f => toString f | CellValue.string s => s | CellValue.bool b => toString b | CellValue.none => "None" /- ## DataFrame Structure A DataFrame is represented as a collection of named columns and row count. - Column: List of CellValue - DataFrame: Structure with columns (name-value pairs) and row count -/ def Column := List CellValue structure DataFrame where columns : List (String ร— Column) nrows : Nat -- Repr instance for DataFrame instance : Repr DataFrame where reprPrec df _ := "DataFrame with " ++ toString df.columns.length ++ " columns and " ++ toString df.nrows ++ " rows" /- ## DataFrame Operations -/ namespace DataFrame -- ============================================================================ -- BASIC OPERATIONS -- ============================================================================ -- Create empty DataFrame def empty : DataFrame := { columns := [], nrows := 0 } -- Helper function to get element from list by index def getAt : List ฮฑ โ†’ Nat โ†’ Option ฮฑ | [], _ => none | x :: _, 0 => some x | _ :: xs, n + 1 => getAt xs n -- Find column by name def findColumn (columns : List (String ร— Column)) (name : String) : Option Column := match columns.find? (fun (colName, _) => colName = name) with | none => none | some (_, col) => some col -- Update or add column def updateColumn (columns : List (String ร— Column)) (name : String) (newCol : Column) : List (String ร— Column) := match columns.findIdx? (fun (colName, _) => colName = name) with | none => columns ++ [(name, newCol)] -- Add new column | some idx => columns.set idx (name, newCol) -- Update existing column -- ============================================================================ -- CORE DATAFRAME OPERATIONS -- ============================================================================ -- Add column to DataFrame def addColumn (df : DataFrame) (name : String) (col : Column) : DataFrame := if col.length = df.nrows โˆจ df.nrows = 0 then { columns := updateColumn df.columns name col, nrows := if df.nrows = 0 then col.length else df.nrows } else df -- Return original DataFrame if size doesn't match -- Get column by name def getColumn (df : DataFrame) (name : String) : Option Column := findColumn df.columns name -- Get number of columns def ncols (df : DataFrame) : Nat := df.columns.length -- Get column names def columnNames (df : DataFrame) : List String := df.columns.map (fun (name, _) => name) -- Get specific cell (row, column) def getCell (df : DataFrame) (row : Nat) (colName : String) : Option CellValue := match df.getColumn colName with | none => none | some col => getAt col row -- Get complete row def getRow (df : DataFrame) (row : Nat) : List (String ร— CellValue) := df.columns.filterMap fun (colName, col) => match getAt col row with | none => none | some val => some (colName, val) -- ============================================================================ -- ADVANCED OPERATIONS -- ============================================================================ -- Filter rows based on predicate def filter (df : DataFrame) (predicate : List (String ร— CellValue) โ†’ Bool) : DataFrame := let validRows := (List.range df.nrows).filter fun i => predicate (df.getRow i) let newColumns := df.columns.map fun (colName, col) => let filteredCol := validRows.filterMap fun i => getAt col i (colName, filteredCol) { columns := newColumns, nrows := validRows.length } -- Select specific columns def select (df : DataFrame) (colNames : List String) : DataFrame := let selectedCols := df.columns.filter fun (name, _) => colNames.contains name let orderedCols := colNames.filterMap fun name => selectedCols.find? fun (colName, _) => colName = name { columns := orderedCols, nrows := df.nrows } -- Map function over column def mapColumn (df : DataFrame) (colName : String) (f : CellValue โ†’ CellValue) : DataFrame := match df.getColumn colName with | none => df | some col => let newCol := col.map f df.addColumn colName newCol -- Apply aggregation function to column def aggregate (df : DataFrame) (colName : String) (f : List CellValue โ†’ CellValue) : Option CellValue := match df.getColumn colName with | none => none | some col => some (f col) -- ============================================================================ -- AGGREGATION FUNCTIONS -- ============================================================================ -- Helper function to convert Int to Float def intToFloat (i : Int) : Float := Float.ofInt i -- Sum aggregation def sum (values : List CellValue) : CellValue := values.foldl (fun acc val => match acc, val with | CellValue.int a, CellValue.int b => CellValue.int (a + b) | CellValue.float a, CellValue.float b => CellValue.float (a + b) | CellValue.int a, CellValue.float b => CellValue.float (intToFloat a + b) | CellValue.float a, CellValue.int b => CellValue.float (a + intToFloat b) | _, _ => acc ) (CellValue.int 0) -- Count aggregation def count (values : List CellValue) : CellValue := CellValue.int values.length -- ============================================================================ -- DISPLAY FUNCTIONS -- ============================================================================ -- Helper function to pad string to fixed width def padString (s : String) (width : Nat) : String := let len := s.length if len >= width then s else s ++ String.mk (List.replicate (width - len) ' ') -- Display DataFrame in formatted way def display (df : DataFrame) : String := let colNames := df.columnNames let colWidth := 12 -- Header let header := colNames.foldl (fun acc name => acc ++ padString name colWidth) "" -- Rows let rows := (List.range df.nrows).map fun i => let rowData := df.getRow i colNames.foldl (fun acc colName => let cellValue := rowData.find? (fun (name, _) => name = colName) match cellValue with | none => acc ++ padString "None" colWidth | some (_, val) => acc ++ padString (toString val) colWidth ) "" -- Separator line let separator := String.mk (List.replicate (colWidth * colNames.length) '-') " " ++ header ++ " " ++ separator ++ " " ++ String.intercalate " " rows -- ToString instance for DataFrame instance : ToString DataFrame where toString := display -- ============================================================================ -- CSV PARSING -- ============================================================================ section CSV -- Helper function to split string by delimiter private def splitString (s : String) (delim : Char) : List String := let chars := s.toList let rec splitRec (acc : List String) (current : List Char) (remaining : List Char) : List String := match remaining with | [] => if current.isEmpty then acc.reverse else (String.mk current.reverse :: acc).reverse | c :: rest => if c = delim then splitRec (String.mk current.reverse :: acc) [] rest else splitRec acc (c :: current) rest splitRec [] [] chars -- Helper function to trim whitespace private def trimString (s : String) : String := let chars := s.toList let rec trimLeft : List Char โ†’ List Char | [] => [] | ' ' :: rest => trimLeft rest | '\t' :: rest => trimLeft rest | '\n' :: rest => trimLeft rest | '\r' :: rest => trimLeft rest | list => list let rec trimRight : List Char โ†’ List Char | [] => [] | list => let reversed := list.reverse let trimmed := trimLeft reversed trimmed.reverse String.mk (trimRight (trimLeft chars)) -- Try to parse value as Float (simplified implementation) private def tryParseFloat (s : String) : Option Float := if s.any (ยท == '.') then let parts := splitString s '.' match parts with | [intPart, fracPart] => match intPart.toInt?, fracPart.toInt? with | some i, some f => let fracDigits := fracPart.length let fracValue := intToFloat f / intToFloat (10 ^ fracDigits) some (intToFloat i + fracValue) | _, _ => none | _ => none else none -- Try to parse value as Bool private def tryParseBool (s : String) : Option Bool := let trimmed := trimString s let lower := trimmed.toLower if lower == "true" || lower == "1" then some true else if lower == "false" || lower == "0" then some false else none -- Infer and convert string value to CellValue private def parseCell (s : String) : CellValue := let trimmed := trimString s -- Empty case if trimmed.isEmpty then CellValue.none -- Try Bool first else match tryParseBool trimmed with | some b => CellValue.bool b -- Try Int | none => match trimmed.toInt? with | some i => CellValue.int i -- Try Float | none => match tryParseFloat trimmed with | some f => CellValue.float f -- Default to String | none => CellValue.string trimmed -- Parse CSV line private def parseCsvLine (line : String) : List String := splitString line ',' -- Main function to parse CSV def fromCSV (csvContent : String) : DataFrame := let lines := splitString csvContent '\n' let nonEmptyLines := lines.filter (fun line => !(trimString line).isEmpty) match nonEmptyLines with | [] => DataFrame.empty | headerLine :: dataLines => let headers := (parseCsvLine headerLine).map trimString let rows := dataLines.map parseCsvLine -- Check if all lines have same number of columns let expectedCols := headers.length let validRows := rows.filter (fun row => row.length == expectedCols) if validRows.isEmpty then -- Return empty DataFrame with headers let emptyColumns := headers.map (fun name => (name, ([] : Column))) { columns := emptyColumns, nrows := 0 } else -- Build columns let nrows := validRows.length let columns := headers.zipIdx.map (fun (name, idx) => let columnValues := validRows.map (fun row => match row[idx]? with | some cellStr => parseCell cellStr | none => CellValue.none ) (name, columnValues) ) { columns := columns, nrows := nrows } end CSV end DataFrame -- ============================================================================ -- EXAMPLES AND TESTS -- ============================================================================ -- Basic example def exemplo : DataFrame := let df := DataFrame.empty let df := df.addColumn "nome" [CellValue.string "Joรฃo", CellValue.string "Maria", CellValue.string "Pedro"] let df := df.addColumn "idade" [CellValue.int 25, CellValue.int 30, CellValue.int 28] let df := df.addColumn "salario" [CellValue.float 5000.0, CellValue.float 6000.0, CellValue.float 5500.0] df -- Basic tests #eval toString exemplo #eval exemplo.ncols #eval exemplo.nrows #eval exemplo.columnNames #eval exemplo.getCell 0 "nome" #eval exemplo.aggregate "idade" DataFrame.sum -- Filtering example (people with age > 26) def exemploFiltrado : DataFrame := exemplo.filter fun row => match row.find? (fun (name, _) => name = "idade") with | none => false | some (_, CellValue.int age) => age > 26 | _ => false #eval toString exemploFiltrado -- Column selection example def exemploSelecionado : DataFrame := exemplo.select ["nome", "salario"] #eval toString exemploSelecionado -- Mapping example (increase salary by 10%) def exemploMapeado : DataFrame := exemplo.mapColumn "salario" fun val => match val with | CellValue.float x => CellValue.float (x * 1.1) | CellValue.int x => CellValue.float (DataFrame.intToFloat x * 1.1) | _ => val #eval toString exemploMapeado -- ============================================================================ -- CSV EXAMPLES -- ============================================================================ -- CSV example 1 def csvExample1 : String := "nome,idade,salario,ativo\nJoรฃo,25,5000.5,true\nMaria,30,6000.0,false\nPedro,28,5500.75,true" def dfFromCsv1 : DataFrame := DataFrame.fromCSV csvExample1 -- CSV example 2 def csvExample2 : String := "produto,preco,quantidade,disponivel\nNotebook,2500,10,true\nMouse,45,100,true\nTeclado,150,0,false" def dfFromCsv2 : DataFrame := DataFrame.fromCSV csvExample2 -- CSV example with missing data def csvExample3 : String := "nome,idade,cidade\nAna,25,Sรฃo Paulo\nCarlos,,Rio de Janeiro\nLuiza,35," def dfFromCsv3 : DataFrame := DataFrame.fromCSV csvExample3 #eval toString dfFromCsv3 -- CSV example 4 def csvExample4 : String := "nome,idade,salario,ativo Lucas,32,4720.25,true Ana,27,5890.0,false Rafael,40,7200.75,true Beatriz,22,4100.0,true Carlos,35,6500.5,false" def dfFromCsv4 : DataFrame := DataFrame.fromCSV csvExample4 #eval toString dfFromCsv4 /- # DataFrames - Theoretical Background ## What is a DataFrame? A DataFrame is a tabular data structure containing named columns and associated data. We seek to represent something like: ```lean structure DataFrame where columns : List String lines : List (List ฮฑ) -- (ฮฑ โˆˆ Type) ? ``` ## Category Theory and DataFrames We can try to formalize DataFrame as a Functor D: ๐““ : ๐’ž โŸถ FinSet Where ๐’ž is a discrete category where objects are column names, and distinct objects have no morphisms between them. The image of an object ๐’ธ โˆˆ ๐’ž representing a column is a finite set of column data ๐““(๐’ธ) โˆˆ FinSet. -/