{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- age: integer (nullable = true)\n", " |-- job: string (nullable = true)\n", " |-- marital: string (nullable = true)\n", " |-- education: string (nullable = true)\n", " |-- default: string (nullable = true)\n", " |-- balance: integer (nullable = true)\n", " |-- housing: string (nullable = true)\n", " |-- loan: string (nullable = true)\n", " |-- contact: string (nullable = true)\n", " |-- day: integer (nullable = true)\n", " |-- month: string (nullable = true)\n", " |-- duration: integer (nullable = true)\n", " |-- campaign: integer (nullable = true)\n", " |-- pdays: integer (nullable = true)\n", " |-- previous: integer (nullable = true)\n", " |-- poutcome: string (nullable = true)\n", " |-- deposit: string (nullable = true)\n", "\n" ] } ], "source": [ "from pyspark.sql import SparkSession\n", "\n", "# Reuse the active Spark session if there is one; otherwise create one named 'ml-bank'.\n", "spark = (\n", "    SparkSession.builder\n", "    .appName('ml-bank')\n", "    .getOrCreate()\n", ")\n", "\n", "# Load bank.csv: the first row supplies column names and column types are inferred.\n", "df = spark.read.csv('bank.csv', header=True, inferSchema=True)\n", "\n", "# Print the inferred schema (column names, types, nullability).\n", "df.printSchema()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pandas dataframe is prettier than Spark DataFrame.show()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "
|---|---|---|---|---|---|
| age | \n", "59 | \n", "56 | \n", "41 | \n", "55 | \n", "54 | \n", "
| job | \n", "admin. | \n", "admin. | \n", "technician | \n", "services | \n", "admin. | \n", "
| marital | \n", "married | \n", "married | \n", "married | \n", "married | \n", "married | \n", "
| education | \n", "secondary | \n", "secondary | \n", "secondary | \n", "secondary | \n", "tertiary | \n", "
| default | \n", "no | \n", "no | \n", "no | \n", "no | \n", "no | \n", "
| balance | \n", "2343 | \n", "45 | \n", "1270 | \n", "2476 | \n", "184 | \n", "
| housing | \n", "yes | \n", "no | \n", "yes | \n", "yes | \n", "no | \n", "
| loan | \n", "no | \n", "no | \n", "no | \n", "no | \n", "no | \n", "
| contact | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "
| day | \n", "5 | \n", "5 | \n", "5 | \n", "5 | \n", "5 | \n", "
| month | \n", "may | \n", "may | \n", "may | \n", "may | \n", "may | \n", "
| duration | \n", "1042 | \n", "1467 | \n", "1389 | \n", "579 | \n", "673 | \n", "
| campaign | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "2 | \n", "
| pdays | \n", "-1 | \n", "-1 | \n", "-1 | \n", "-1 | \n", "-1 | \n", "
| previous | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
| poutcome | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "
| deposit | \n", "yes | \n", "yes | \n", "yes | \n", "yes | \n", "yes | \n", "
| \n", " | deposit | \n", "count | \n", "
|---|---|---|
| 0 | \n", "no | \n", "5873 | \n", "
| 1 | \n", "yes | \n", "5289 | \n", "
| \n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "
|---|---|---|---|---|---|
| summary | \n", "count | \n", "mean | \n", "stddev | \n", "min | \n", "max | \n", "
| age | \n", "11162 | \n", "41.231947679627304 | \n", "11.913369192215518 | \n", "18 | \n", "95 | \n", "
| balance | \n", "11162 | \n", "1528.5385235620856 | \n", "3225.413325946149 | \n", "-6847 | \n", "81204 | \n", "
| day | \n", "11162 | \n", "15.658036194230425 | \n", "8.420739541006462 | \n", "1 | \n", "31 | \n", "
| duration | \n", "11162 | \n", "371.99381831213043 | \n", "347.12838571630687 | \n", "2 | \n", "3881 | \n", "
| campaign | \n", "11162 | \n", "2.508421429851281 | \n", "2.7220771816614824 | \n", "1 | \n", "63 | \n", "
| pdays | \n", "11162 | \n", "51.33040673714388 | \n", "108.75828197197717 | \n", "-1 | \n", "854 | \n", "
| previous | \n", "11162 | \n", "0.8325568894463358 | \n", "2.292007218670508 | \n", "0 | \n", "58 | \n", "
| \n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "
|---|---|---|---|---|---|
| label | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
| features | \n", "(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n", "(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n", "(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n", "(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... | \n", "(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... | \n", "
| age | \n", "59 | \n", "56 | \n", "41 | \n", "55 | \n", "54 | \n", "
| job | \n", "admin. | \n", "admin. | \n", "technician | \n", "services | \n", "admin. | \n", "
| marital | \n", "married | \n", "married | \n", "married | \n", "married | \n", "married | \n", "
| education | \n", "secondary | \n", "secondary | \n", "secondary | \n", "secondary | \n", "tertiary | \n", "
| default | \n", "no | \n", "no | \n", "no | \n", "no | \n", "no | \n", "
| balance | \n", "2343 | \n", "45 | \n", "1270 | \n", "2476 | \n", "184 | \n", "
| housing | \n", "yes | \n", "no | \n", "yes | \n", "yes | \n", "no | \n", "
| loan | \n", "no | \n", "no | \n", "no | \n", "no | \n", "no | \n", "
| contact | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "
| duration | \n", "1042 | \n", "1467 | \n", "1389 | \n", "579 | \n", "673 | \n", "
| campaign | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "2 | \n", "
| pdays | \n", "-1 | \n", "-1 | \n", "-1 | \n", "-1 | \n", "-1 | \n", "
| previous | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
| poutcome | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "unknown | \n", "
| deposit | \n", "yes | \n", "yes | \n", "yes | \n", "yes | \n", "yes | \n", "