{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## INTRO\n",
    "- Basics OP on Pyspark DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# OP \n",
    "import datetime as dt   \n",
    "import time\n",
    "import csv\n",
    "import requests\n",
    "import pandas as pd, numpy as np\n",
    "\n",
    "# SPARK \n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark import SparkConf, SparkContext\n",
    "from pyspark.sql import SQLContext, Row\n",
    "from operator import add\n",
    "from pyspark.sql.functions import countDistinct, avg, stddev, format_number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# config \n",
    "conf = SparkConf().setAppName(\"LOAD PTT MYSQL DATABASE\")\n",
    "sc = SparkContext(conf=conf)\n",
    "sqlContext = SQLContext(sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1) Pyspark Read csv as Spark DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the data\n",
    "df_boston = sqlContext.read\\\n",
    "                      .format('com.databricks.spark.csv')\\\n",
    "                      .options(header='true', inferschema='true')\\\n",
    "                      .load('boston.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.dataframe.DataFrame"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(df_boston)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: double, NOX: double, RM: double, AGE: double, DIS: double, RAD: double, TAX: double, PTRATIO: double, B: double, LSTAT: double, price: double]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_boston"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- CRIM: double (nullable = true)\n",
      " |-- ZN: double (nullable = true)\n",
      " |-- INDUS: double (nullable = true)\n",
      " |-- CHAS: double (nullable = true)\n",
      " |-- NOX: double (nullable = true)\n",
      " |-- RM: double (nullable = true)\n",
      " |-- AGE: double (nullable = true)\n",
      " |-- DIS: double (nullable = true)\n",
      " |-- RAD: double (nullable = true)\n",
      " |-- TAX: double (nullable = true)\n",
      " |-- PTRATIO: double (nullable = true)\n",
      " |-- B: double (nullable = true)\n",
      " |-- LSTAT: double (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_boston.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CRIM',\n",
       " 'ZN',\n",
       " 'INDUS',\n",
       " 'CHAS',\n",
       " 'NOX',\n",
       " 'RM',\n",
       " 'AGE',\n",
       " 'DIS',\n",
       " 'RAD',\n",
       " 'TAX',\n",
       " 'PTRATIO',\n",
       " 'B',\n",
       " 'LSTAT',\n",
       " 'price']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_boston.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+\n",
      "|summary|              CRIM|                ZN|             INDUS|              CHAS|                NOX|                RM|               AGE|              DIS|              RAD|               TAX|           PTRATIO|                 B|             LSTAT|             price|\n",
      "+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+\n",
      "|  count|               506|               506|               506|               506|                506|               506|               506|              506|              506|               506|               506|               506|               506|               506|\n",
      "|   mean|3.6135235573122535|11.363636363636363|11.136778656126504|0.0691699604743083| 0.5546950592885372| 6.284634387351787| 68.57490118577078|3.795042687747034|9.549407114624506| 408.2371541501976|18.455533596837967|356.67403162055257|12.653063241106723|22.532806324110698|\n",
      "| stddev| 8.601545105332491| 23.32245299451514| 6.860352940897589|0.2539940413404101|0.11587767566755584|0.7026171434153232|28.148861406903595| 2.10571012662761|8.707259384239366|168.53711605495903|2.1649455237144455| 91.29486438415782| 7.141061511348571| 9.197104087379815|\n",
      "|    min|           0.00632|               0.0|              0.46|               0.0|              0.385|             3.561|               2.9|           1.1296|              1.0|             187.0|              12.6|              0.32|              1.73|               5.0|\n",
      "|    max|           88.9762|             100.0|             27.74|               1.0|              0.871|              8.78|             100.0|          12.1265|             24.0|             711.0|              22.0|             396.9|             37.97|              50.0|\n",
      "+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_boston.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|\n",
      "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6|\n",
      "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|\n",
      "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|\n",
      "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2|\n",
      "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|\n",
      "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9|\n",
      "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1|\n",
      "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0|   15.2|386.63|29.93| 16.5|\n",
      "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0|   15.2|386.71| 17.1| 18.9|\n",
      "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0|   15.2|392.52|20.45| 15.0|\n",
      "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0|   15.2| 396.9|13.27| 18.9|\n",
      "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0|   15.2| 390.5|15.71| 21.7|\n",
      "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0|   21.0| 396.9| 8.26| 20.4|\n",
      "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0|   21.0|380.02|10.26| 18.2|\n",
      "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0|   21.0|395.62| 8.47| 19.9|\n",
      "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0|   21.0|386.85| 6.58| 23.1|\n",
      "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0|   21.0|386.75|14.67| 17.5|\n",
      "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0|   21.0|288.99|11.69| 20.2|\n",
      "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0|   21.0|390.95|11.28| 18.2|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_boston.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2) Manually set DataFrame schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.types import (StructField, StructType,\n",
    "                               IntegerType, StringType, LongType)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_schema = [StructField('CRIM',StringType(), True )]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_struc = StructType(fields= data_schema)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reload the csv BUT WITH PRE-DEFINED SCHEMA AS ABOVE \n",
    "df_boston_updated = sqlContext.read\\\n",
    "                      .format('com.databricks.spark.csv')\\\n",
    "                      .options(header='true', inferschema='true', shema= final_struc)\\\n",
    "                      .load('boston.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- CRIM: double (nullable = true)\n",
      " |-- ZN: double (nullable = true)\n",
      " |-- INDUS: double (nullable = true)\n",
      " |-- CHAS: double (nullable = true)\n",
      " |-- NOX: double (nullable = true)\n",
      " |-- RM: double (nullable = true)\n",
      " |-- AGE: double (nullable = true)\n",
      " |-- DIS: double (nullable = true)\n",
      " |-- RAD: double (nullable = true)\n",
      " |-- TAX: double (nullable = true)\n",
      " |-- PTRATIO: double (nullable = true)\n",
      " |-- B: double (nullable = true)\n",
      " |-- LSTAT: double (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_boston_updated.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3) Select one column from Spark DF "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.column.Column"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pandas way \n",
    "# which is not working here \n",
    "type(df_boston['CRIM'])\n",
    "# df_boston['CRIM'].show() <--- not work"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pyspark.sql.dataframe.DataFrame"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pyspark way \n",
    "type(df_boston.select('CRIM'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+\n",
      "|   CRIM|\n",
      "+-------+\n",
      "|0.00632|\n",
      "|0.02731|\n",
      "|0.02729|\n",
      "|0.03237|\n",
      "|0.06905|\n",
      "|0.02985|\n",
      "|0.08829|\n",
      "|0.14455|\n",
      "|0.21124|\n",
      "|0.17004|\n",
      "|0.22489|\n",
      "|0.11747|\n",
      "|0.09378|\n",
      "|0.62976|\n",
      "|0.63796|\n",
      "|0.62739|\n",
      "|1.05393|\n",
      "| 0.7842|\n",
      "|0.80271|\n",
      "| 0.7258|\n",
      "+-------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# SELECT 1 coluumn\n",
    "df_boston.select('CRIM').show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+\n",
      "|   CRIM|     B|\n",
      "+-------+------+\n",
      "|0.00632| 396.9|\n",
      "|0.02731| 396.9|\n",
      "|0.02729|392.83|\n",
      "|0.03237|394.63|\n",
      "|0.06905| 396.9|\n",
      "|0.02985|394.12|\n",
      "|0.08829| 395.6|\n",
      "|0.14455| 396.9|\n",
      "|0.21124|386.63|\n",
      "|0.17004|386.71|\n",
      "|0.22489|392.52|\n",
      "|0.11747| 396.9|\n",
      "|0.09378| 390.5|\n",
      "|0.62976| 396.9|\n",
      "|0.63796|380.02|\n",
      "|0.62739|395.62|\n",
      "|1.05393|386.85|\n",
      "| 0.7842|386.75|\n",
      "|0.80271|288.99|\n",
      "| 0.7258|390.95|\n",
      "+-------+------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# SELECT multiple coluumns\n",
    "df_boston.select(['CRIM','B']).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(CRIM=0.00632, ZN=18.0, INDUS=2.31, CHAS=0.0, NOX=0.538, RM=6.575, AGE=65.2, DIS=4.09, RAD=1.0, TAX=296.0, PTRATIO=15.3, B=396.9, LSTAT=4.98, price=24.0),\n",
       " Row(CRIM=0.02731, ZN=0.0, INDUS=7.07, CHAS=0.0, NOX=0.469, RM=6.421, AGE=78.9, DIS=4.9671, RAD=2.0, TAX=242.0, PTRATIO=17.8, B=396.9, LSTAT=9.14, price=21.6)]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_boston.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(CRIM=0.00632, ZN=18.0, INDUS=2.31, CHAS=0.0, NOX=0.538, RM=6.575, AGE=65.2, DIS=4.09, RAD=1.0, TAX=296.0, PTRATIO=15.3, B=396.9, LSTAT=4.98, price=24.0)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_boston.head(2)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4) Create a  new column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n",
      "|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|  colX|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n",
      "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0| 396.9|\n",
      "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6| 396.9|\n",
      "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|392.83|\n",
      "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|394.63|\n",
      "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2| 396.9|\n",
      "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|394.12|\n",
      "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9| 395.6|\n",
      "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1| 396.9|\n",
      "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0|   15.2|386.63|29.93| 16.5|386.63|\n",
      "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0|   15.2|386.71| 17.1| 18.9|386.71|\n",
      "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0|   15.2|392.52|20.45| 15.0|392.52|\n",
      "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0|   15.2| 396.9|13.27| 18.9| 396.9|\n",
      "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0|   15.2| 390.5|15.71| 21.7| 390.5|\n",
      "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0|   21.0| 396.9| 8.26| 20.4| 396.9|\n",
      "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0|   21.0|380.02|10.26| 18.2|380.02|\n",
      "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0|   21.0|395.62| 8.47| 19.9|395.62|\n",
      "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0|   21.0|386.85| 6.58| 23.1|386.85|\n",
      "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0|   21.0|386.75|14.67| 17.5|386.75|\n",
      "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0|   21.0|288.99|11.69| 20.2|288.99|\n",
      "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0|   21.0|390.95|11.28| 18.2|390.95|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# add new column : colX\n",
    "df_boston.withColumn('colX', df_boston['B']).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n",
      "|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|  colY|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n",
      "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0| 793.8|\n",
      "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6| 793.8|\n",
      "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|785.66|\n",
      "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|789.26|\n",
      "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2| 793.8|\n",
      "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|788.24|\n",
      "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9| 791.2|\n",
      "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1| 793.8|\n",
      "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0|   15.2|386.63|29.93| 16.5|773.26|\n",
      "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0|   15.2|386.71| 17.1| 18.9|773.42|\n",
      "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0|   15.2|392.52|20.45| 15.0|785.04|\n",
      "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0|   15.2| 396.9|13.27| 18.9| 793.8|\n",
      "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0|   15.2| 390.5|15.71| 21.7| 781.0|\n",
      "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0|   21.0| 396.9| 8.26| 20.4| 793.8|\n",
      "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0|   21.0|380.02|10.26| 18.2|760.04|\n",
      "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0|   21.0|395.62| 8.47| 19.9|791.24|\n",
      "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0|   21.0|386.85| 6.58| 23.1| 773.7|\n",
      "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0|   21.0|386.75|14.67| 17.5| 773.5|\n",
      "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0|   21.0|288.99|11.69| 20.2|577.98|\n",
      "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0|   21.0|390.95|11.28| 18.2| 781.9|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# add new column : colY\n",
    "df_boston.withColumn('colY', df_boston['B']*2).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5) Rename columns "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|   BBB|LSTAT|price|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|\n",
      "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6|\n",
      "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|\n",
      "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|\n",
      "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2|\n",
      "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|\n",
      "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9|\n",
      "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1|\n",
      "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0|   15.2|386.63|29.93| 16.5|\n",
      "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0|   15.2|386.71| 17.1| 18.9|\n",
      "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0|   15.2|392.52|20.45| 15.0|\n",
      "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0|   15.2| 396.9|13.27| 18.9|\n",
      "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0|   15.2| 390.5|15.71| 21.7|\n",
      "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0|   21.0| 396.9| 8.26| 20.4|\n",
      "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0|   21.0|380.02|10.26| 18.2|\n",
      "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0|   21.0|395.62| 8.47| 19.9|\n",
      "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0|   21.0|386.85| 6.58| 23.1|\n",
      "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0|   21.0|386.75|14.67| 17.5|\n",
      "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0|   21.0|288.99|11.69| 20.2|\n",
      "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0|   21.0|390.95|11.28| 18.2|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# rename column \"B\" -> \"BBB\"\n",
    "df_boston.withColumnRenamed('B', 'BBB').show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6) Pyspark SQL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a temp SQL view from df \n",
    "df_boston.createOrReplaceTempView('BOSTON')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+\n",
      "|     B|\n",
      "+------+\n",
      "| 396.9|\n",
      "| 396.9|\n",
      "|392.83|\n",
      "|394.63|\n",
      "| 396.9|\n",
      "|394.12|\n",
      "| 395.6|\n",
      "| 396.9|\n",
      "|386.63|\n",
      "|386.71|\n",
      "+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "result=sqlContext.sql(\"select B from BOSTON LIMIT 10\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7) Filter  data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "|   CRIM| ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS| RAD|  TAX|PTRATIO|    B|LSTAT|price|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "|51.1358|0.0| 18.1| 0.0|0.597|5.757|100.0| 1.413|24.0|666.0|   20.2|  2.6|10.11| 15.0|\n",
      "|14.0507|0.0| 18.1| 0.0|0.597|6.657|100.0|1.5275|24.0|666.0|   20.2|35.05|21.22| 17.2|\n",
      "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0|   20.2|28.79|34.37| 17.9|\n",
      "|18.0846|0.0| 18.1| 0.0|0.679|6.434|100.0|1.8347|24.0|666.0|   20.2|27.25|29.05|  7.2|\n",
      "|10.8342|0.0| 18.1| 0.0|0.679|6.782| 90.8|1.8195|24.0|666.0|   20.2|21.57|25.79|  7.5|\n",
      "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0|   20.2|16.45|20.62|  8.8|\n",
      "|11.8123|0.0| 18.1| 0.0|0.718|6.824| 76.5| 1.794|24.0|666.0|   20.2|48.45|22.74|  8.4|\n",
      "|7.05042|0.0| 18.1| 0.0|0.614|6.103| 85.1|2.0218|24.0|666.0|   20.2| 2.52|23.29| 13.4|\n",
      "|8.79212|0.0| 18.1| 0.0|0.584|5.565| 70.6|2.0635|24.0|666.0|   20.2| 3.65|17.16| 11.7|\n",
      "|15.8603|0.0| 18.1| 0.0|0.679|5.896| 95.4|1.9096|24.0|666.0|   20.2| 7.68|24.39|  8.3|\n",
      "|12.2472|0.0| 18.1| 0.0|0.584|5.837| 59.7|1.9976|24.0|666.0|   20.2|24.65|15.69| 10.2|\n",
      "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0|   20.2|18.82|14.52| 10.9|\n",
      "|14.4208|0.0| 18.1| 0.0| 0.74|6.461| 93.3|2.0026|24.0|666.0|   20.2|27.49|18.05|  9.6|\n",
      "|15.1772|0.0| 18.1| 0.0| 0.74|6.152|100.0|1.9142|24.0|666.0|   20.2| 9.32|26.45|  8.7|\n",
      "|10.6718|0.0| 18.1| 0.0| 0.74|6.459| 94.8|1.9879|24.0|666.0|   20.2|43.06|23.98| 11.8|\n",
      "|6.71772|0.0| 18.1| 0.0|0.713|6.749| 92.6|2.3236|24.0|666.0|   20.2| 0.32|17.44| 13.4|\n",
      "|9.51363|0.0| 18.1| 0.0|0.713|6.728| 94.1|2.4961|24.0|666.0|   20.2| 6.68|18.71| 14.9|\n",
      "|4.66883|0.0| 18.1| 0.0|0.713|5.976| 87.9|2.5806|24.0|666.0|   20.2|10.48|19.01| 12.7|\n",
      "|8.20058|0.0| 18.1| 0.0|0.713|5.936| 80.3|2.7792|24.0|666.0|   20.2|  3.5|16.94| 13.5|\n",
      "|3.77498|0.0| 18.1| 0.0|0.655|5.952| 84.7|2.8715|24.0|666.0|   20.2|22.01|17.15| 19.0|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "result2=df_boston.filter(\"B < 50\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+\n",
      "|    B|\n",
      "+-----+\n",
      "|  2.6|\n",
      "|35.05|\n",
      "|28.79|\n",
      "|27.25|\n",
      "|21.57|\n",
      "|16.45|\n",
      "|48.45|\n",
      "| 2.52|\n",
      "| 3.65|\n",
      "| 7.68|\n",
      "|24.65|\n",
      "|18.82|\n",
      "|27.49|\n",
      "| 9.32|\n",
      "|43.06|\n",
      "| 0.32|\n",
      "| 6.68|\n",
      "|10.48|\n",
      "|  3.5|\n",
      "|22.01|\n",
      "+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "result2=df_boston.filter(\"B < 50\").select(['B']).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "|   CRIM| ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS| RAD|  TAX|PTRATIO|    B|LSTAT|price|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "|51.1358|0.0| 18.1| 0.0|0.597|5.757|100.0| 1.413|24.0|666.0|   20.2|  2.6|10.11| 15.0|\n",
      "|14.0507|0.0| 18.1| 0.0|0.597|6.657|100.0|1.5275|24.0|666.0|   20.2|35.05|21.22| 17.2|\n",
      "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0|   20.2|28.79|34.37| 17.9|\n",
      "|18.0846|0.0| 18.1| 0.0|0.679|6.434|100.0|1.8347|24.0|666.0|   20.2|27.25|29.05|  7.2|\n",
      "|10.8342|0.0| 18.1| 0.0|0.679|6.782| 90.8|1.8195|24.0|666.0|   20.2|21.57|25.79|  7.5|\n",
      "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0|   20.2|16.45|20.62|  8.8|\n",
      "|11.8123|0.0| 18.1| 0.0|0.718|6.824| 76.5| 1.794|24.0|666.0|   20.2|48.45|22.74|  8.4|\n",
      "|7.05042|0.0| 18.1| 0.0|0.614|6.103| 85.1|2.0218|24.0|666.0|   20.2| 2.52|23.29| 13.4|\n",
      "|8.79212|0.0| 18.1| 0.0|0.584|5.565| 70.6|2.0635|24.0|666.0|   20.2| 3.65|17.16| 11.7|\n",
      "|15.8603|0.0| 18.1| 0.0|0.679|5.896| 95.4|1.9096|24.0|666.0|   20.2| 7.68|24.39|  8.3|\n",
      "|12.2472|0.0| 18.1| 0.0|0.584|5.837| 59.7|1.9976|24.0|666.0|   20.2|24.65|15.69| 10.2|\n",
      "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0|   20.2|18.82|14.52| 10.9|\n",
      "|14.4208|0.0| 18.1| 0.0| 0.74|6.461| 93.3|2.0026|24.0|666.0|   20.2|27.49|18.05|  9.6|\n",
      "|15.1772|0.0| 18.1| 0.0| 0.74|6.152|100.0|1.9142|24.0|666.0|   20.2| 9.32|26.45|  8.7|\n",
      "|10.6718|0.0| 18.1| 0.0| 0.74|6.459| 94.8|1.9879|24.0|666.0|   20.2|43.06|23.98| 11.8|\n",
      "|6.71772|0.0| 18.1| 0.0|0.713|6.749| 92.6|2.3236|24.0|666.0|   20.2| 0.32|17.44| 13.4|\n",
      "|9.51363|0.0| 18.1| 0.0|0.713|6.728| 94.1|2.4961|24.0|666.0|   20.2| 6.68|18.71| 14.9|\n",
      "|4.66883|0.0| 18.1| 0.0|0.713|5.976| 87.9|2.5806|24.0|666.0|   20.2|10.48|19.01| 12.7|\n",
      "|8.20058|0.0| 18.1| 0.0|0.713|5.936| 80.3|2.7792|24.0|666.0|   20.2|  3.5|16.94| 13.5|\n",
      "|3.77498|0.0| 18.1| 0.0|0.655|5.952| 84.7|2.8715|24.0|666.0|   20.2|22.01|17.15| 19.0|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# python way \n",
    "result2=df_boston.filter(df_boston['B']<50).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "|   CRIM| ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS| RAD|  TAX|PTRATIO|    B|LSTAT|price|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "|14.0507|0.0| 18.1| 0.0|0.597|6.657|100.0|1.5275|24.0|666.0|   20.2|35.05|21.22| 17.2|\n",
      "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0|   20.2|28.79|34.37| 17.9|\n",
      "|18.0846|0.0| 18.1| 0.0|0.679|6.434|100.0|1.8347|24.0|666.0|   20.2|27.25|29.05|  7.2|\n",
      "|10.8342|0.0| 18.1| 0.0|0.679|6.782| 90.8|1.8195|24.0|666.0|   20.2|21.57|25.79|  7.5|\n",
      "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0|   20.2|16.45|20.62|  8.8|\n",
      "|11.8123|0.0| 18.1| 0.0|0.718|6.824| 76.5| 1.794|24.0|666.0|   20.2|48.45|22.74|  8.4|\n",
      "|12.2472|0.0| 18.1| 0.0|0.584|5.837| 59.7|1.9976|24.0|666.0|   20.2|24.65|15.69| 10.2|\n",
      "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0|   20.2|18.82|14.52| 10.9|\n",
      "|14.4208|0.0| 18.1| 0.0| 0.74|6.461| 93.3|2.0026|24.0|666.0|   20.2|27.49|18.05|  9.6|\n",
      "|10.6718|0.0| 18.1| 0.0| 0.74|6.459| 94.8|1.9879|24.0|666.0|   20.2|43.06|23.98| 11.8|\n",
      "|4.66883|0.0| 18.1| 0.0|0.713|5.976| 87.9|2.5806|24.0|666.0|   20.2|10.48|19.01| 12.7|\n",
      "|3.77498|0.0| 18.1| 0.0|0.655|5.952| 84.7|2.8715|24.0|666.0|   20.2|22.01|17.15| 19.0|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#####  filter in multiple conditions #####\n",
    "\n",
    "df_boston.filter((df_boston['B'] < 50 )  & (df_boston['B'] > 10 )  ).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8) Get filter spark data as dict, array.. and process on them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "filer_result=df_boston.filter((df_boston['B'] < 50 )  & (df_boston['B'] > 10 )  ).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(CRIM=14.0507, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.597, RM=6.657, AGE=100.0, DIS=1.5275, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=35.05, LSTAT=21.22, price=17.2),\n",
       " Row(CRIM=18.811, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.597, RM=4.628, AGE=100.0, DIS=1.5539, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=28.79, LSTAT=34.37, price=17.9),\n",
       " Row(CRIM=18.0846, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=6.434, AGE=100.0, DIS=1.8347, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=27.25, LSTAT=29.05, price=7.2),\n",
       " Row(CRIM=10.8342, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=6.782, AGE=90.8, DIS=1.8195, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=21.57, LSTAT=25.79, price=7.5),\n",
       " Row(CRIM=73.5341, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=5.957, AGE=100.0, DIS=1.8026, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=16.45, LSTAT=20.62, price=8.8),\n",
       " Row(CRIM=11.8123, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.718, RM=6.824, AGE=76.5, DIS=1.794, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=48.45, LSTAT=22.74, price=8.4),\n",
       " Row(CRIM=12.2472, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.584, RM=5.837, AGE=59.7, DIS=1.9976, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=24.65, LSTAT=15.69, price=10.2),\n",
       " Row(CRIM=37.6619, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=6.202, AGE=78.7, DIS=1.8629, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=18.82, LSTAT=14.52, price=10.9),\n",
       " Row(CRIM=14.4208, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.74, RM=6.461, AGE=93.3, DIS=2.0026, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=27.49, LSTAT=18.05, price=9.6),\n",
       " Row(CRIM=10.6718, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.74, RM=6.459, AGE=94.8, DIS=1.9879, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=43.06, LSTAT=23.98, price=11.8),\n",
       " Row(CRIM=4.66883, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.713, RM=5.976, AGE=87.9, DIS=2.5806, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=10.48, LSTAT=19.01, price=12.7),\n",
       " Row(CRIM=3.77498, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.655, RM=5.952, AGE=84.7, DIS=2.8715, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=22.01, LSTAT=17.15, price=19.0)]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filer_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(filer_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(CRIM=14.0507, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.597, RM=6.657, AGE=100.0, DIS=1.5275, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=35.05, LSTAT=21.22, price=17.2)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filer_result[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'AGE': 100.0,\n",
       " 'B': 35.05,\n",
       " 'CHAS': 0.0,\n",
       " 'CRIM': 14.0507,\n",
       " 'DIS': 1.5275,\n",
       " 'INDUS': 18.1,\n",
       " 'LSTAT': 21.22,\n",
       " 'NOX': 0.597,\n",
       " 'PTRATIO': 20.2,\n",
       " 'RAD': 24.0,\n",
       " 'RM': 6.657,\n",
       " 'TAX': 666.0,\n",
       " 'ZN': 0.0,\n",
       " 'price': 17.2}"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filer_result[0].asDict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100.0"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filer_result[0].asDict()['AGE']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# END OF COURSE 8.26 \n",
    "# next 8.27"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9) Spark df groupby "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pyspark.sql.group.GroupedData at 0x109f03550>"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_boston.groupby(\"RAD\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[RAD: double, avg(CRIM): double, avg(ZN): double, avg(INDUS): double, avg(CHAS): double, avg(NOX): double, avg(RM): double, avg(AGE): double, avg(DIS): double, avg(RAD): double, avg(TAX): double, avg(PTRATIO): double, avg(B): double, avg(LSTAT): double, avg(price): double]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_boston.groupby(\"RAD\").mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+--------+------------------+------------------+------------------+------------------+------------------+\n",
      "| RAD|           avg(CRIM)|           avg(ZN)|        avg(INDUS)|          avg(CHAS)|           avg(NOX)|           avg(RM)|         avg(AGE)|          avg(DIS)|avg(RAD)|          avg(TAX)|      avg(PTRATIO)|            avg(B)|        avg(LSTAT)|        avg(price)|\n",
      "+----+--------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+--------+------------------+------------------+------------------+------------------+------------------+\n",
      "| 8.0|  0.3714095833333333|              6.25|            5.9325|0.20833333333333334|0.49249999999999977|6.9507916666666665|            67.35| 4.410604166666666|     8.0|            301.25| 17.97499999999999|          385.2775|7.9608333333333325|30.358333333333334|\n",
      "| 7.0| 0.15039941176470584|26.705882352941178| 5.034705882352943|                0.0| 0.4409999999999999| 6.647764705882355|40.14117647058823| 6.495858823529412|     7.0| 304.4117647058824| 18.40588235294117| 388.4329411764706|  7.98823529411765|27.105882352941173|\n",
      "| 1.0|0.036028500000000005|              39.9| 5.066000000000001|               0.05| 0.4628900000000001| 6.555850000000001|           45.025| 6.027824999999999|     1.0|            291.45|17.564999999999998|389.27299999999997| 7.369999999999999|24.364999999999995|\n",
      "| 4.0| 0.39390345454545467|14.731818181818182|10.746636363636362|0.07272727272727272| 0.5043109090909086|6.1331181818181815|60.84181818181818|             4.433|     4.0| 335.9818181818182|19.136363636363644|  382.721272727273|12.199090909090906| 21.38727272727272|\n",
      "| 3.0| 0.09735947368421054| 16.38157894736842|  4.41894736842105|0.05263157894736842| 0.4524236842105263| 6.473026315789473|49.31052631578947| 5.146492105263157|     3.0|246.07894736842104|18.168421052631583|  392.417894736842| 9.076052631578946|27.928947368421053|\n",
      "| 2.0| 0.08328541666666667|20.416666666666668| 9.630833333333337|                0.0|0.48491666666666644| 6.649958333333335|64.77083333333333| 4.097008333333332|     2.0| 260.6666666666667| 17.29166666666667|         386.41375|10.024583333333332| 26.83333333333333|\n",
      "| 6.0| 0.15005384615384618| 12.98076923076923| 8.198076923076924|                0.0| 0.5148461538461541| 6.104730769230769|60.14230769230768| 4.024915384615384|     6.0|372.88461538461536|17.815384615384616| 387.3665384615384|12.305769230769231|20.976923076923075|\n",
      "| 5.0|  0.6877916521739129|11.108695652173912| 9.759826086956538|0.09565217391304348| 0.5708834782608687|  6.39315652173913|69.22347826086958|3.6973286956521743|     5.0| 331.8434782608696| 16.54695652173914|369.19243478260887|10.657217391304345| 25.70695652173912|\n",
      "|24.0|  12.759290909090915|               0.0|18.099999999999955|0.06060606060606061| 0.6724166666666664| 6.022151515151517|89.80530303030304|2.0612537878787878|    24.0|             666.0|20.199999999999985|288.08916666666687|18.600606060606058| 16.40378787878788|\n",
      "+----+--------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+--------+------------------+------------------+------------------+------------------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_boston.groupby(\"RAD\").mean().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------+\n",
      "|         avg(RAD)|\n",
      "+-----------------+\n",
      "|9.549407114624506|\n",
      "+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# agg \n",
    "df_boston.agg({'RAD':'mean'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+-------+\n",
      "| RAD|max(ZN)|\n",
      "+----+-------+\n",
      "| 8.0|   25.0|\n",
      "| 7.0|   34.0|\n",
      "| 1.0|   90.0|\n",
      "| 4.0|   95.0|\n",
      "| 3.0|   95.0|\n",
      "| 2.0|   85.0|\n",
      "| 6.0|   52.5|\n",
      "| 5.0|  100.0|\n",
      "|24.0|    0.0|\n",
      "+----+-------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# group + agg  \n",
    "group_data = df_boston.groupby('RAD')\n",
    "group_data.agg({'ZN': 'max'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+--------+\n",
      "| RAD|max(NOX)|\n",
      "+----+--------+\n",
      "| 8.0|   0.507|\n",
      "| 7.0|   0.472|\n",
      "| 1.0|   0.573|\n",
      "| 4.0|   0.624|\n",
      "| 3.0|   0.488|\n",
      "| 2.0|   0.581|\n",
      "| 6.0|   0.585|\n",
      "| 5.0|   0.871|\n",
      "|24.0|    0.77|\n",
      "+----+--------+\n",
      "\n",
      "+----+-------+\n",
      "| RAD|max(RM)|\n",
      "+----+-------+\n",
      "| 8.0|  8.725|\n",
      "| 7.0|  8.259|\n",
      "| 1.0|  7.923|\n",
      "| 4.0|  8.034|\n",
      "| 3.0|  7.831|\n",
      "| 2.0|  8.069|\n",
      "| 6.0|  6.897|\n",
      "| 5.0|  8.704|\n",
      "|24.0|   8.78|\n",
      "+----+-------+\n",
      "\n",
      "+----+--------+\n",
      "| RAD|max(AGE)|\n",
      "+----+--------+\n",
      "| 8.0|    93.4|\n",
      "| 7.0|    79.2|\n",
      "| 1.0|    91.0|\n",
      "| 4.0|   100.0|\n",
      "| 3.0|    95.6|\n",
      "| 2.0|    97.0|\n",
      "| 6.0|    95.4|\n",
      "| 5.0|   100.0|\n",
      "|24.0|   100.0|\n",
      "+----+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# group + agg for loop  -- general case \n",
    "\n",
    "group_data = df_boston.groupby('RAD')\n",
    "for col in ['NOX','RM','AGE']:\n",
    "    group_data.agg({col: 'max'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|count(DISTINCT CRIM)|\n",
      "+--------------------+\n",
      "|                 504|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# import spark default func  :  countDistinct, avg, stddev\n",
    "\n",
    "df_boston.select(countDistinct('CRIM')).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|         avg(CRIM)|\n",
      "+------------------+\n",
      "|3.6135235573122535|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_boston.select(avg('CRIM')).show()\n",
    "\n",
    "# compare with the groupby method \n",
    "# df_boston.agg({'CRIM':'mean'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+\n",
      "|AVG_CRIM|\n",
      "+--------+\n",
      "|     504|\n",
      "+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# rename the result col \n",
    "df_boston.select(countDistinct('CRIM').alias('AVG_CRIM')).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------+\n",
      "|              std|\n",
      "+-----------------+\n",
      "|8.601545105332491|\n",
      "+-----------------+\n",
      "\n",
      "None\n",
      "+---------------------+\n",
      "|format_number(std, 2)|\n",
      "+---------------------+\n",
      "|                 8.60|\n",
      "+---------------------+\n",
      "\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# format  result digits \n",
    "\n",
    "crim_std = df_boston.select(stddev('CRIM').alias('std'))\n",
    "\n",
    "print (crim_std.show())\n",
    "\n",
    "print (crim_std.select(format_number('std',2)).show())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----+-----+----+------+-----+----+-------+---+-----+-------+------+-----+-----+\n",
      "|   CRIM|   ZN|INDUS|CHAS|   NOX|   RM| AGE|    DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|\n",
      "+-------+-----+-----+----+------+-----+----+-------+---+-----+-------+------+-----+-----+\n",
      "|0.00632| 18.0| 2.31| 0.0| 0.538|6.575|65.2|   4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|\n",
      "|0.00906| 90.0| 2.97| 0.0|   0.4|7.088|20.8| 7.3073|1.0|285.0|   15.3|394.72| 7.85| 32.2|\n",
      "|0.01096| 55.0| 2.25| 0.0| 0.389|6.453|31.9| 7.3073|1.0|300.0|   15.3|394.72| 8.23| 22.0|\n",
      "|0.01301| 35.0| 1.52| 0.0| 0.442|7.241|49.3| 7.0379|1.0|284.0|   15.5|394.74| 5.49| 32.7|\n",
      "|0.01311| 90.0| 1.22| 0.0| 0.403|7.249|21.9| 8.6966|5.0|226.0|   17.9|395.93| 4.81| 35.4|\n",
      "| 0.0136| 75.0|  4.0| 0.0|  0.41|5.888|47.6| 7.3197|3.0|469.0|   21.1| 396.9| 14.8| 18.9|\n",
      "|0.01381| 80.0| 0.46| 0.0| 0.422|7.875|32.0| 5.6484|4.0|255.0|   14.4|394.23| 2.97| 50.0|\n",
      "|0.01432|100.0| 1.32| 0.0| 0.411|6.816|40.5| 8.3248|5.0|256.0|   15.1| 392.9| 3.95| 31.6|\n",
      "|0.01439| 60.0| 2.93| 0.0| 0.401|6.604|18.8| 6.2196|1.0|265.0|   15.6| 376.7| 4.38| 29.1|\n",
      "|0.01501| 90.0| 1.21| 1.0| 0.401|7.923|24.8|  5.885|1.0|198.0|   13.6|395.52| 3.16| 50.0|\n",
      "|0.01501| 80.0| 2.01| 0.0| 0.435|6.635|29.7|  8.344|4.0|280.0|   17.0|390.94| 5.99| 24.5|\n",
      "|0.01538| 90.0| 3.75| 0.0| 0.394|7.454|34.2| 6.3361|3.0|244.0|   15.9|386.34| 3.11| 44.0|\n",
      "|0.01709| 90.0| 2.02| 0.0|  0.41|6.728|36.1|12.1265|5.0|187.0|   17.0|384.46|  4.5| 30.1|\n",
      "|0.01778| 95.0| 1.47| 0.0| 0.403|7.135|13.9| 7.6534|3.0|402.0|   17.0| 384.3| 4.45| 32.9|\n",
      "| 0.0187| 85.0| 4.15| 0.0| 0.429|6.516|27.7| 8.5353|4.0|351.0|   17.9|392.43| 6.36| 23.1|\n",
      "|0.01951| 17.5| 1.38| 0.0|0.4161|7.104|59.5| 9.2229|3.0|216.0|   18.6|393.24| 8.05| 33.0|\n",
      "|0.01965| 80.0| 1.76| 0.0| 0.385| 6.23|31.5| 9.0892|1.0|241.0|   18.2| 341.6|12.93| 20.1|\n",
      "|0.02009| 95.0| 2.68| 0.0|0.4161|8.034|31.9|  5.118|4.0|224.0|   14.7|390.55| 2.88| 50.0|\n",
      "|0.02055| 85.0| 0.74| 0.0|  0.41|6.383|35.7| 9.1876|2.0|313.0|   17.3| 396.9| 5.77| 24.7|\n",
      "|0.02177| 82.5| 2.03| 0.0| 0.415| 7.61|15.7|   6.27|2.0|348.0|   14.7|395.38| 3.11| 42.3|\n",
      "+-------+-----+-----+----+------+-----+----+-------+---+-----+-------+------+-----+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# order by \n",
    "\n",
    "df_boston.orderBy('CRIM').show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+------+-----+-----+\n",
      "|   CRIM| ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS| RAD|  TAX|PTRATIO|     B|LSTAT|price|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+------+-----+-----+\n",
      "|88.9762|0.0| 18.1| 0.0|0.671|6.968| 91.9|1.4165|24.0|666.0|   20.2| 396.9|17.21| 10.4|\n",
      "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0|   20.2| 16.45|20.62|  8.8|\n",
      "|67.9208|0.0| 18.1| 0.0|0.693|5.683|100.0|1.4254|24.0|666.0|   20.2|384.97|22.98|  5.0|\n",
      "|51.1358|0.0| 18.1| 0.0|0.597|5.757|100.0| 1.413|24.0|666.0|   20.2|   2.6|10.11| 15.0|\n",
      "|45.7461|0.0| 18.1| 0.0|0.693|4.519|100.0|1.6582|24.0|666.0|   20.2| 88.27|36.98|  7.0|\n",
      "|41.5292|0.0| 18.1| 0.0|0.693|5.531| 85.4|1.6074|24.0|666.0|   20.2|329.46|27.38|  8.5|\n",
      "|38.3518|0.0| 18.1| 0.0|0.693|5.453|100.0|1.4896|24.0|666.0|   20.2| 396.9|30.59|  5.0|\n",
      "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0|   20.2| 18.82|14.52| 10.9|\n",
      "|28.6558|0.0| 18.1| 0.0|0.597|5.155|100.0|1.5894|24.0|666.0|   20.2|210.97|20.08| 16.3|\n",
      "|25.9406|0.0| 18.1| 0.0|0.679|5.304| 89.1|1.6475|24.0|666.0|   20.2|127.36|26.64| 10.4|\n",
      "|25.0461|0.0| 18.1| 0.0|0.693|5.987|100.0|1.5888|24.0|666.0|   20.2| 396.9|26.77|  5.6|\n",
      "|24.8017|0.0| 18.1| 0.0|0.693|5.349| 96.0|1.7028|24.0|666.0|   20.2| 396.9|19.77|  8.3|\n",
      "|24.3938|0.0| 18.1| 0.0|  0.7|4.652|100.0|1.4672|24.0|666.0|   20.2| 396.9|28.28| 10.5|\n",
      "|23.6482|0.0| 18.1| 0.0|0.671| 6.38| 96.2|1.3861|24.0|666.0|   20.2| 396.9|23.69| 13.1|\n",
      "|22.5971|0.0| 18.1| 0.0|  0.7|  5.0| 89.5|1.5184|24.0|666.0|   20.2| 396.9|31.99|  7.4|\n",
      "|22.0511|0.0| 18.1| 0.0| 0.74|5.818| 92.4|1.8662|24.0|666.0|   20.2|391.45|22.11| 10.5|\n",
      "|20.7162|0.0| 18.1| 0.0|0.659|4.138|100.0|1.1781|24.0|666.0|   20.2|370.22|23.34| 11.9|\n",
      "|20.0849|0.0| 18.1| 0.0|  0.7|4.368| 91.2|1.4395|24.0|666.0|   20.2|285.83|30.63|  8.8|\n",
      "|19.6091|0.0| 18.1| 0.0|0.671|7.313| 97.9|1.3163|24.0|666.0|   20.2| 396.9|13.44| 15.0|\n",
      "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0|   20.2| 28.79|34.37| 17.9|\n",
      "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+------+-----+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# order by : inverse order \n",
    "\n",
    "df_boston.orderBy(df_boston['CRIM'].desc()).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10) Missing data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|\n",
      "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6|\n",
      "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|\n",
      "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|\n",
      "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2|\n",
      "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|\n",
      "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9|\n",
      "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1|\n",
      "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0|   15.2|386.63|29.93| 16.5|\n",
      "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0|   15.2|386.71| 17.1| 18.9|\n",
      "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0|   15.2|392.52|20.45| 15.0|\n",
      "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0|   15.2| 396.9|13.27| 18.9|\n",
      "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0|   15.2| 390.5|15.71| 21.7|\n",
      "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0|   21.0| 396.9| 8.26| 20.4|\n",
      "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0|   21.0|380.02|10.26| 18.2|\n",
      "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0|   21.0|395.62| 8.47| 19.9|\n",
      "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0|   21.0|386.85| 6.58| 23.1|\n",
      "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0|   21.0|386.75|14.67| 17.5|\n",
      "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0|   21.0|288.99|11.69| 20.2|\n",
      "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0|   21.0|390.95|11.28| 18.2|\n",
      "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# drop null data \n",
    "df_boston.na.drop().show()\n",
    "\n",
    "# only drop when all column data is null \n",
    "#df_boston.na.drop(how='all').show()\n",
    "\n",
    "# only drop if specfic column data is null \n",
    "#df_boston.na.drop(subset = ['CRIM']).show()\n",
    "\n",
    "\n",
    "# fill null data \n",
    "#df_boston.na.fill('FILLED VALUE').show()\n",
    "\n",
    "\n",
    "# fill with avg value\n",
    "# from pyspark.sql.functions import mean \n",
    "# mean_val = df_boston.select(mean(df['CRIM'])).collect()\n",
    "# mean_val_ = mean_val[0][0]\n",
    "#df_boston.na.fill(mean_val_, ['CRIM']).show()\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11) Dates and Timestamp "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load timestamp csv \n",
    "\n",
    "# load the data\n",
    "df_timestamp = sqlContext.read\\\n",
    "                      .format('com.databricks.spark.csv')\\\n",
    "                      .options(header='true', inferschema='true')\\\n",
    "                      .load('df_test.csv')\n",
    "            \n",
    "# rename column \"_c0\" -> \"time\"\n",
    "df_timestamp = df_timestamp.withColumnRenamed('_c0', 'time')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+---------------+---------------+---------------+-----------------+\n",
      "|               time|              A|              B|              C|                D|\n",
      "+-------------------+---------------+---------------+---------------+-----------------+\n",
      "|2013-01-01 00:00:00|  1.74284507345| 0.827612028809|-0.806632282957|   0.306776368559|\n",
      "|2013-01-02 00:00:00|  1.33922469984|-0.795165583739| -1.77473024596|   0.437705419137|\n",
      "|2013-01-03 00:00:00| 0.652848767373| -1.54077088745| -1.41021709968|    1.70035456582|\n",
      "|2013-01-04 00:00:00| 0.247396585844| -1.30234054482|-0.846092769624|   0.699764788482|\n",
      "|2013-01-05 00:00:00|-0.740530243845| 0.713124246753| -1.16320459805|-0.00284413632768|\n",
      "|2013-01-06 00:00:00| -1.17173754177| 0.560340996287| 0.946558700023|  -0.615189047933|\n",
      "+-------------------+---------------+---------------+---------------+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_timestamp.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import timestamp OP methods \n",
    "from pyspark.sql.functions import (dayofmonth,\n",
    "                                   hour,\n",
    "                                   dayofyear,\n",
    "                                   month, \n",
    "                                   year,\n",
    "                                   weekofyear,\n",
    "                                   format_number,\n",
    "                                   date_format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------+-----------+----------+-------------------+\n",
      "|dayofmonth(time)|month(time)|year(time)|               time|\n",
      "+----------------+-----------+----------+-------------------+\n",
      "|               1|          1|      2013|2013-01-01 00:00:00|\n",
      "|               2|          1|      2013|2013-01-02 00:00:00|\n",
      "|               3|          1|      2013|2013-01-03 00:00:00|\n",
      "|               4|          1|      2013|2013-01-04 00:00:00|\n",
      "|               5|          1|      2013|2013-01-05 00:00:00|\n",
      "|               6|          1|      2013|2013-01-06 00:00:00|\n",
      "+----------------+-----------+----------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# transform timestemp via timestamp OP methods \n",
    "\n",
    "df_timestamp.select(dayofmonth(df_timestamp['time']),\n",
    "                    month(df_timestamp['time']),\n",
    "                    year(df_timestamp['time']),\n",
    "                    (df_timestamp['time'])).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+------------------+--------------+-------------------+------------------+---------+\n",
      "|year|            avg(A)|        avg(B)|             avg(C)|            avg(D)|avg(year)|\n",
      "+----+------------------+--------------+-------------------+------------------+---------+\n",
      "|2013|0.3450078901486666|-0.25619995736|-0.8423863827079999|0.4210946596228866|   2013.0|\n",
      "+----+------------------+--------------+-------------------+------------------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# get avg value per year \n",
    "df_timestamp_new =  df_timestamp.withColumn('year', year(df_timestamp['time']))\n",
    "df_timestamp_new.groupby('year').mean().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+------------------+\n",
      "|year|            avg(A)|\n",
      "+----+------------------+\n",
      "|2013|0.3450078901486666|\n",
      "+----+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# only select the needed columns \n",
    "df_timestamp_new.groupby('year').mean().select(['year','avg(A)']).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---------------------------------+\n",
      "|format_number(Average A value, 2)|\n",
      "+---------------------------------+\n",
      "|                             0.35|\n",
      "+---------------------------------+\n",
      "\n",
      "+----+------------------+\n",
      "|year|   Average A value|\n",
      "+----+------------------+\n",
      "|2013|0.3450078901486666|\n",
      "+----+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# format the outcome \n",
    "result = df_timestamp_new.groupby('year').mean().select(['year','avg(A)'])\n",
    "new_result = result.withColumnRenamed(\"avg(A)\", \"Average A value\")\n",
    "new_result.select(format_number(\"Average A value\", 2)).alias('Average A value').show()\n",
    "\n",
    "\n",
    "new_result.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# end of Section 8 \n",
    "# next : Section 9 "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Environment (conda_pyspark_)",
   "language": "python",
   "name": "conda_pyspark_"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}