{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## INTRO\n", "- Basics OP on Pyspark DataFrame" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# OP \n", "import datetime as dt \n", "import time\n", "import csv\n", "import requests\n", "import pandas as pd, numpy as np\n", "\n", "# SPARK \n", "from pyspark.sql import SparkSession\n", "from pyspark import SparkConf, SparkContext\n", "from pyspark.sql import SQLContext, Row\n", "from operator import add\n", "from pyspark.sql.functions import countDistinct, avg, stddev, format_number" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# config \n", "conf = SparkConf().setAppName(\"LOAD PTT MYSQL DATABASE\")\n", "sc = SparkContext(conf=conf)\n", "sqlContext = SQLContext(sc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1) Pyspark Read csv as Spark DataFrame" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# load the data\n", "df_boston = sqlContext.read\\\n", " .format('com.databricks.spark.csv')\\\n", " .options(header='true', inferschema='true')\\\n", " .load('boston.csv')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pyspark.sql.dataframe.DataFrame" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(df_boston)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: double, NOX: double, RM: double, AGE: double, DIS: double, RAD: double, TAX: double, PTRATIO: double, B: double, LSTAT: double, price: double]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_boston" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- CRIM: double (nullable = true)\n", " |-- ZN: double (nullable = true)\n", " |-- INDUS: double (nullable = true)\n", " |-- CHAS: double (nullable = true)\n", " |-- NOX: double (nullable = true)\n", " |-- RM: double (nullable = true)\n", " |-- AGE: double (nullable = true)\n", " |-- DIS: double (nullable = true)\n", " |-- RAD: double (nullable = true)\n", " |-- TAX: double (nullable = true)\n", " |-- PTRATIO: double (nullable = true)\n", " |-- B: double (nullable = true)\n", " |-- LSTAT: double (nullable = true)\n", " |-- price: double (nullable = true)\n", "\n" ] } ], "source": [ "df_boston.printSchema()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['CRIM',\n", " 'ZN',\n", " 'INDUS',\n", " 'CHAS',\n", " 'NOX',\n", " 'RM',\n", " 'AGE',\n", " 'DIS',\n", " 'RAD',\n", " 'TAX',\n", " 'PTRATIO',\n", " 'B',\n", " 'LSTAT',\n", " 'price']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_boston.columns" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+\n", "|summary| CRIM| ZN| INDUS| CHAS| NOX| RM| AGE| DIS| RAD| TAX| PTRATIO| B| LSTAT| price|\n", "+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+\n", "| count| 506| 506| 506| 506| 506| 506| 506| 506| 506| 506| 506| 506| 506| 506|\n", "| mean|3.6135235573122535|11.363636363636363|11.136778656126504|0.0691699604743083| 0.5546950592885372| 6.284634387351787| 68.57490118577078|3.795042687747034|9.549407114624506| 408.2371541501976|18.455533596837967|356.67403162055257|12.653063241106723|22.532806324110698|\n", "| stddev| 8.601545105332491| 23.32245299451514| 6.860352940897589|0.2539940413404101|0.11587767566755584|0.7026171434153232|28.148861406903595| 2.10571012662761|8.707259384239366|168.53711605495903|2.1649455237144455| 91.29486438415782| 7.141061511348571| 9.197104087379815|\n", "| min| 0.00632| 0.0| 0.46| 0.0| 0.385| 3.561| 2.9| 1.1296| 1.0| 187.0| 12.6| 0.32| 1.73| 5.0|\n", "| max| 88.9762| 100.0| 27.74| 1.0| 0.871| 8.78| 100.0| 12.1265| 24.0| 711.0| 22.0| 396.9| 37.97| 50.0|\n", "+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+\n", "\n" ] } ], "source": [ "df_boston.describe().show()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS|RAD| TAX|PTRATIO| B|LSTAT|price|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2| 4.09|1.0|296.0| 15.3| 396.9| 4.98| 24.0|\n", "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0| 17.8| 396.9| 9.14| 21.6|\n", "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0| 17.8|392.83| 4.03| 34.7|\n", "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0| 18.7|394.63| 2.94| 33.4|\n", "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0| 18.7| 396.9| 5.33| 36.2|\n", "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0| 18.7|394.12| 5.21| 28.7|\n", "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0| 15.2| 395.6|12.43| 22.9|\n", "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0| 15.2| 396.9|19.15| 27.1|\n", "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0| 15.2|386.63|29.93| 16.5|\n", "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0| 15.2|386.71| 17.1| 18.9|\n", "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0| 15.2|392.52|20.45| 15.0|\n", "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0| 15.2| 396.9|13.27| 18.9|\n", "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0| 15.2| 390.5|15.71| 21.7|\n", "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0| 21.0| 396.9| 8.26| 20.4|\n", "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0| 21.0|380.02|10.26| 18.2|\n", "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0| 21.0|395.62| 8.47| 19.9|\n", "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0| 21.0|386.85| 6.58| 23.1|\n", "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0| 21.0|386.75|14.67| 17.5|\n", "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0| 21.0|288.99|11.69| 20.2|\n", "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0| 21.0|390.95|11.28| 18.2|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "df_boston.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2) Manually set DataFrame schema" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.types import (StructField, StructType,\n", " IntegerType, StringType, LongType)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "data_schema = [StructField('CRIM',StringType(), True )]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "final_struc = StructType(fields= data_schema)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# reload the csv BUT WITH PRE-DEFINED SCHEMA AS ABOVE \n", "df_boston_updated = sqlContext.read\\\n", " .format('com.databricks.spark.csv')\\\n", " .options(header='true', inferschema='true', shema= final_struc)\\\n", " .load('boston.csv')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- CRIM: double (nullable = true)\n", " |-- ZN: double (nullable = true)\n", " |-- INDUS: double (nullable = true)\n", " |-- CHAS: double (nullable = true)\n", " |-- NOX: double (nullable = true)\n", " |-- RM: double (nullable = true)\n", " |-- AGE: double (nullable = true)\n", " |-- DIS: double (nullable = true)\n", " |-- RAD: double (nullable = true)\n", " |-- TAX: double (nullable = true)\n", " |-- PTRATIO: double (nullable = true)\n", " |-- B: double (nullable = true)\n", " |-- LSTAT: double (nullable = true)\n", " |-- price: double (nullable = true)\n", "\n" ] } ], "source": [ "df_boston_updated.printSchema()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3) Select one column from Spark DF " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pyspark.sql.column.Column" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pandas way \n", "# which is not working here \n", "type(df_boston['CRIM'])\n", "# df_boston['CRIM'].show() <--- not work" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pyspark.sql.dataframe.DataFrame" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pyspark way \n", "type(df_boston.select('CRIM'))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+\n", "| CRIM|\n", "+-------+\n", "|0.00632|\n", "|0.02731|\n", "|0.02729|\n", "|0.03237|\n", "|0.06905|\n", "|0.02985|\n", "|0.08829|\n", "|0.14455|\n", "|0.21124|\n", "|0.17004|\n", "|0.22489|\n", "|0.11747|\n", "|0.09378|\n", "|0.62976|\n", "|0.63796|\n", "|0.62739|\n", "|1.05393|\n", "| 0.7842|\n", "|0.80271|\n", "| 0.7258|\n", "+-------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# SELECT 1 coluumn\n", "df_boston.select('CRIM').show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------+\n", "| CRIM| B|\n", "+-------+------+\n", "|0.00632| 396.9|\n", "|0.02731| 396.9|\n", "|0.02729|392.83|\n", "|0.03237|394.63|\n", "|0.06905| 396.9|\n", "|0.02985|394.12|\n", "|0.08829| 395.6|\n", "|0.14455| 396.9|\n", "|0.21124|386.63|\n", "|0.17004|386.71|\n", "|0.22489|392.52|\n", "|0.11747| 396.9|\n", "|0.09378| 390.5|\n", "|0.62976| 396.9|\n", "|0.63796|380.02|\n", "|0.62739|395.62|\n", "|1.05393|386.85|\n", "| 0.7842|386.75|\n", "|0.80271|288.99|\n", "| 0.7258|390.95|\n", "+-------+------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# SELECT multiple coluumns\n", "df_boston.select(['CRIM','B']).show()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Row(CRIM=0.00632, ZN=18.0, INDUS=2.31, CHAS=0.0, NOX=0.538, RM=6.575, AGE=65.2, DIS=4.09, RAD=1.0, TAX=296.0, PTRATIO=15.3, B=396.9, LSTAT=4.98, price=24.0),\n", " Row(CRIM=0.02731, ZN=0.0, INDUS=7.07, CHAS=0.0, NOX=0.469, RM=6.421, AGE=78.9, DIS=4.9671, RAD=2.0, TAX=242.0, PTRATIO=17.8, B=396.9, LSTAT=9.14, price=21.6)]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_boston.head(2)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Row(CRIM=0.00632, ZN=18.0, INDUS=2.31, CHAS=0.0, NOX=0.538, RM=6.575, AGE=65.2, DIS=4.09, RAD=1.0, TAX=296.0, PTRATIO=15.3, B=396.9, LSTAT=4.98, price=24.0)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_boston.head(2)[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4) Create a new column" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS|RAD| TAX|PTRATIO| B|LSTAT|price| colX|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n", "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2| 4.09|1.0|296.0| 15.3| 396.9| 4.98| 24.0| 396.9|\n", "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0| 17.8| 396.9| 9.14| 21.6| 396.9|\n", "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0| 17.8|392.83| 4.03| 34.7|392.83|\n", "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0| 18.7|394.63| 2.94| 33.4|394.63|\n", "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0| 18.7| 396.9| 5.33| 36.2| 396.9|\n", "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0| 18.7|394.12| 5.21| 28.7|394.12|\n", "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0| 15.2| 395.6|12.43| 22.9| 395.6|\n", "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0| 15.2| 396.9|19.15| 27.1| 396.9|\n", "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0| 15.2|386.63|29.93| 16.5|386.63|\n", "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0| 15.2|386.71| 17.1| 18.9|386.71|\n", "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0| 15.2|392.52|20.45| 15.0|392.52|\n", "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0| 15.2| 396.9|13.27| 18.9| 396.9|\n", "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0| 15.2| 390.5|15.71| 21.7| 390.5|\n", "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0| 21.0| 396.9| 8.26| 20.4| 396.9|\n", "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0| 21.0|380.02|10.26| 18.2|380.02|\n", "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0| 21.0|395.62| 8.47| 19.9|395.62|\n", "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0| 21.0|386.85| 6.58| 23.1|386.85|\n", "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0| 21.0|386.75|14.67| 17.5|386.75|\n", "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0| 21.0|288.99|11.69| 20.2|288.99|\n", "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0| 21.0|390.95|11.28| 18.2|390.95|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# add new column : colX\n", "df_boston.withColumn('colX', df_boston['B']).show()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS|RAD| TAX|PTRATIO| B|LSTAT|price| colY|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n", "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2| 4.09|1.0|296.0| 15.3| 396.9| 4.98| 24.0| 793.8|\n", "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0| 17.8| 396.9| 9.14| 21.6| 793.8|\n", "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0| 17.8|392.83| 4.03| 34.7|785.66|\n", "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0| 18.7|394.63| 2.94| 33.4|789.26|\n", "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0| 18.7| 396.9| 5.33| 36.2| 793.8|\n", "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0| 18.7|394.12| 5.21| 28.7|788.24|\n", "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0| 15.2| 395.6|12.43| 22.9| 791.2|\n", "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0| 15.2| 396.9|19.15| 27.1| 793.8|\n", "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0| 15.2|386.63|29.93| 16.5|773.26|\n", "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0| 15.2|386.71| 17.1| 18.9|773.42|\n", "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0| 15.2|392.52|20.45| 15.0|785.04|\n", "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0| 15.2| 396.9|13.27| 18.9| 793.8|\n", "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0| 15.2| 390.5|15.71| 21.7| 781.0|\n", "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0| 21.0| 396.9| 8.26| 20.4| 793.8|\n", "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0| 21.0|380.02|10.26| 18.2|760.04|\n", "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0| 21.0|395.62| 8.47| 19.9|791.24|\n", "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0| 21.0|386.85| 6.58| 23.1| 773.7|\n", "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0| 21.0|386.75|14.67| 17.5| 773.5|\n", "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0| 21.0|288.99|11.69| 20.2|577.98|\n", "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0| 21.0|390.95|11.28| 18.2| 781.9|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# add new column : colY\n", "df_boston.withColumn('colY', df_boston['B']*2).show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5) Rename columns " ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS|RAD| TAX|PTRATIO| BBB|LSTAT|price|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2| 4.09|1.0|296.0| 15.3| 396.9| 4.98| 24.0|\n", "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0| 17.8| 396.9| 9.14| 21.6|\n", "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0| 17.8|392.83| 4.03| 34.7|\n", "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0| 18.7|394.63| 2.94| 33.4|\n", "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0| 18.7| 396.9| 5.33| 36.2|\n", "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0| 18.7|394.12| 5.21| 28.7|\n", "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0| 15.2| 395.6|12.43| 22.9|\n", "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0| 15.2| 396.9|19.15| 27.1|\n", "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0| 15.2|386.63|29.93| 16.5|\n", "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0| 15.2|386.71| 17.1| 18.9|\n", "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0| 15.2|392.52|20.45| 15.0|\n", "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0| 15.2| 396.9|13.27| 18.9|\n", "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0| 15.2| 390.5|15.71| 21.7|\n", "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0| 21.0| 396.9| 8.26| 20.4|\n", "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0| 21.0|380.02|10.26| 18.2|\n", "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0| 21.0|395.62| 8.47| 19.9|\n", "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0| 21.0|386.85| 6.58| 23.1|\n", "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0| 21.0|386.75|14.67| 17.5|\n", "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0| 21.0|288.99|11.69| 20.2|\n", "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0| 21.0|390.95|11.28| 18.2|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# rename column \"B\" -> \"BBB\"\n", "df_boston.withColumnRenamed('B', 'BBB').show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6) Pyspark SQL" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# create a temp SQL view from df \n", "df_boston.createOrReplaceTempView('BOSTON')" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+\n", "| B|\n", "+------+\n", "| 396.9|\n", "| 396.9|\n", "|392.83|\n", "|394.63|\n", "| 396.9|\n", "|394.12|\n", "| 395.6|\n", "| 396.9|\n", "|386.63|\n", "|386.71|\n", "+------+\n", "\n" ] } ], "source": [ "result=sqlContext.sql(\"select B from BOSTON LIMIT 10\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7) Filter data" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS| RAD| TAX|PTRATIO| B|LSTAT|price|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "|51.1358|0.0| 18.1| 0.0|0.597|5.757|100.0| 1.413|24.0|666.0| 20.2| 2.6|10.11| 15.0|\n", "|14.0507|0.0| 18.1| 0.0|0.597|6.657|100.0|1.5275|24.0|666.0| 20.2|35.05|21.22| 17.2|\n", "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0| 20.2|28.79|34.37| 17.9|\n", "|18.0846|0.0| 18.1| 0.0|0.679|6.434|100.0|1.8347|24.0|666.0| 20.2|27.25|29.05| 7.2|\n", "|10.8342|0.0| 18.1| 0.0|0.679|6.782| 90.8|1.8195|24.0|666.0| 20.2|21.57|25.79| 7.5|\n", "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0| 20.2|16.45|20.62| 8.8|\n", "|11.8123|0.0| 18.1| 0.0|0.718|6.824| 76.5| 1.794|24.0|666.0| 20.2|48.45|22.74| 8.4|\n", "|7.05042|0.0| 18.1| 0.0|0.614|6.103| 85.1|2.0218|24.0|666.0| 20.2| 2.52|23.29| 13.4|\n", "|8.79212|0.0| 18.1| 0.0|0.584|5.565| 70.6|2.0635|24.0|666.0| 20.2| 3.65|17.16| 11.7|\n", "|15.8603|0.0| 18.1| 0.0|0.679|5.896| 95.4|1.9096|24.0|666.0| 20.2| 7.68|24.39| 8.3|\n", "|12.2472|0.0| 18.1| 0.0|0.584|5.837| 59.7|1.9976|24.0|666.0| 20.2|24.65|15.69| 10.2|\n", "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0| 20.2|18.82|14.52| 10.9|\n", "|14.4208|0.0| 18.1| 0.0| 0.74|6.461| 93.3|2.0026|24.0|666.0| 20.2|27.49|18.05| 9.6|\n", "|15.1772|0.0| 18.1| 0.0| 0.74|6.152|100.0|1.9142|24.0|666.0| 20.2| 9.32|26.45| 8.7|\n", "|10.6718|0.0| 18.1| 0.0| 0.74|6.459| 94.8|1.9879|24.0|666.0| 20.2|43.06|23.98| 11.8|\n", "|6.71772|0.0| 18.1| 0.0|0.713|6.749| 92.6|2.3236|24.0|666.0| 20.2| 0.32|17.44| 13.4|\n", "|9.51363|0.0| 18.1| 0.0|0.713|6.728| 94.1|2.4961|24.0|666.0| 20.2| 6.68|18.71| 14.9|\n", "|4.66883|0.0| 18.1| 0.0|0.713|5.976| 87.9|2.5806|24.0|666.0| 20.2|10.48|19.01| 12.7|\n", "|8.20058|0.0| 18.1| 0.0|0.713|5.936| 80.3|2.7792|24.0|666.0| 20.2| 3.5|16.94| 13.5|\n", "|3.77498|0.0| 18.1| 0.0|0.655|5.952| 84.7|2.8715|24.0|666.0| 20.2|22.01|17.15| 19.0|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "\n" ] } ], "source": [ "result2=df_boston.filter(\"B < 50\").show()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----+\n", "| B|\n", "+-----+\n", "| 2.6|\n", "|35.05|\n", "|28.79|\n", "|27.25|\n", "|21.57|\n", "|16.45|\n", "|48.45|\n", "| 2.52|\n", "| 3.65|\n", "| 7.68|\n", "|24.65|\n", "|18.82|\n", "|27.49|\n", "| 9.32|\n", "|43.06|\n", "| 0.32|\n", "| 6.68|\n", "|10.48|\n", "| 3.5|\n", "|22.01|\n", "+-----+\n", "\n" ] } ], "source": [ "result2=df_boston.filter(\"B < 50\").select(['B']).show()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS| RAD| TAX|PTRATIO| B|LSTAT|price|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "|51.1358|0.0| 18.1| 0.0|0.597|5.757|100.0| 1.413|24.0|666.0| 20.2| 2.6|10.11| 15.0|\n", "|14.0507|0.0| 18.1| 0.0|0.597|6.657|100.0|1.5275|24.0|666.0| 20.2|35.05|21.22| 17.2|\n", "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0| 20.2|28.79|34.37| 17.9|\n", "|18.0846|0.0| 18.1| 0.0|0.679|6.434|100.0|1.8347|24.0|666.0| 20.2|27.25|29.05| 7.2|\n", "|10.8342|0.0| 18.1| 0.0|0.679|6.782| 90.8|1.8195|24.0|666.0| 20.2|21.57|25.79| 7.5|\n", "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0| 20.2|16.45|20.62| 8.8|\n", "|11.8123|0.0| 18.1| 0.0|0.718|6.824| 76.5| 1.794|24.0|666.0| 20.2|48.45|22.74| 8.4|\n", "|7.05042|0.0| 18.1| 0.0|0.614|6.103| 85.1|2.0218|24.0|666.0| 20.2| 2.52|23.29| 13.4|\n", "|8.79212|0.0| 18.1| 0.0|0.584|5.565| 70.6|2.0635|24.0|666.0| 20.2| 3.65|17.16| 11.7|\n", "|15.8603|0.0| 18.1| 0.0|0.679|5.896| 95.4|1.9096|24.0|666.0| 20.2| 7.68|24.39| 8.3|\n", "|12.2472|0.0| 18.1| 0.0|0.584|5.837| 59.7|1.9976|24.0|666.0| 20.2|24.65|15.69| 10.2|\n", "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0| 20.2|18.82|14.52| 10.9|\n", "|14.4208|0.0| 18.1| 0.0| 0.74|6.461| 93.3|2.0026|24.0|666.0| 20.2|27.49|18.05| 9.6|\n", "|15.1772|0.0| 18.1| 0.0| 0.74|6.152|100.0|1.9142|24.0|666.0| 20.2| 9.32|26.45| 8.7|\n", "|10.6718|0.0| 18.1| 0.0| 0.74|6.459| 94.8|1.9879|24.0|666.0| 20.2|43.06|23.98| 11.8|\n", "|6.71772|0.0| 18.1| 0.0|0.713|6.749| 92.6|2.3236|24.0|666.0| 20.2| 0.32|17.44| 13.4|\n", "|9.51363|0.0| 18.1| 0.0|0.713|6.728| 94.1|2.4961|24.0|666.0| 20.2| 6.68|18.71| 14.9|\n", "|4.66883|0.0| 18.1| 0.0|0.713|5.976| 87.9|2.5806|24.0|666.0| 20.2|10.48|19.01| 12.7|\n", "|8.20058|0.0| 18.1| 0.0|0.713|5.936| 80.3|2.7792|24.0|666.0| 20.2| 3.5|16.94| 13.5|\n", "|3.77498|0.0| 18.1| 0.0|0.655|5.952| 84.7|2.8715|24.0|666.0| 20.2|22.01|17.15| 19.0|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "\n" ] } ], "source": [ "# python way \n", "result2=df_boston.filter(df_boston['B']<50).show()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS| RAD| TAX|PTRATIO| B|LSTAT|price|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "|14.0507|0.0| 18.1| 0.0|0.597|6.657|100.0|1.5275|24.0|666.0| 20.2|35.05|21.22| 17.2|\n", "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0| 20.2|28.79|34.37| 17.9|\n", "|18.0846|0.0| 18.1| 0.0|0.679|6.434|100.0|1.8347|24.0|666.0| 20.2|27.25|29.05| 7.2|\n", "|10.8342|0.0| 18.1| 0.0|0.679|6.782| 90.8|1.8195|24.0|666.0| 20.2|21.57|25.79| 7.5|\n", "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0| 20.2|16.45|20.62| 8.8|\n", "|11.8123|0.0| 18.1| 0.0|0.718|6.824| 76.5| 1.794|24.0|666.0| 20.2|48.45|22.74| 8.4|\n", "|12.2472|0.0| 18.1| 0.0|0.584|5.837| 59.7|1.9976|24.0|666.0| 20.2|24.65|15.69| 10.2|\n", "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0| 20.2|18.82|14.52| 10.9|\n", "|14.4208|0.0| 18.1| 0.0| 0.74|6.461| 93.3|2.0026|24.0|666.0| 20.2|27.49|18.05| 9.6|\n", "|10.6718|0.0| 18.1| 0.0| 0.74|6.459| 94.8|1.9879|24.0|666.0| 20.2|43.06|23.98| 11.8|\n", "|4.66883|0.0| 18.1| 0.0|0.713|5.976| 87.9|2.5806|24.0|666.0| 20.2|10.48|19.01| 12.7|\n", "|3.77498|0.0| 18.1| 0.0|0.655|5.952| 84.7|2.8715|24.0|666.0| 20.2|22.01|17.15| 19.0|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+-----+-----+-----+\n", "\n" ] } ], "source": [ "##### filter in multiple conditions #####\n", "\n", "df_boston.filter((df_boston['B'] < 50 ) & (df_boston['B'] > 10 ) ).show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8) Get filter spark data as dict, array.. and process on them" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "filer_result=df_boston.filter((df_boston['B'] < 50 ) & (df_boston['B'] > 10 ) ).collect()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Row(CRIM=14.0507, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.597, RM=6.657, AGE=100.0, DIS=1.5275, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=35.05, LSTAT=21.22, price=17.2),\n", " Row(CRIM=18.811, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.597, RM=4.628, AGE=100.0, DIS=1.5539, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=28.79, LSTAT=34.37, price=17.9),\n", " Row(CRIM=18.0846, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=6.434, AGE=100.0, DIS=1.8347, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=27.25, LSTAT=29.05, price=7.2),\n", " Row(CRIM=10.8342, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=6.782, AGE=90.8, DIS=1.8195, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=21.57, LSTAT=25.79, price=7.5),\n", " Row(CRIM=73.5341, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=5.957, AGE=100.0, DIS=1.8026, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=16.45, LSTAT=20.62, price=8.8),\n", " Row(CRIM=11.8123, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.718, RM=6.824, AGE=76.5, DIS=1.794, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=48.45, LSTAT=22.74, price=8.4),\n", " Row(CRIM=12.2472, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.584, RM=5.837, AGE=59.7, DIS=1.9976, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=24.65, LSTAT=15.69, price=10.2),\n", " Row(CRIM=37.6619, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.679, RM=6.202, AGE=78.7, DIS=1.8629, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=18.82, LSTAT=14.52, price=10.9),\n", " Row(CRIM=14.4208, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.74, RM=6.461, AGE=93.3, DIS=2.0026, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=27.49, LSTAT=18.05, price=9.6),\n", " Row(CRIM=10.6718, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.74, RM=6.459, AGE=94.8, DIS=1.9879, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=43.06, LSTAT=23.98, price=11.8),\n", " Row(CRIM=4.66883, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.713, RM=5.976, AGE=87.9, DIS=2.5806, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=10.48, LSTAT=19.01, price=12.7),\n", " Row(CRIM=3.77498, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.655, RM=5.952, AGE=84.7, DIS=2.8715, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=22.01, LSTAT=17.15, price=19.0)]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filer_result" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "list" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(filer_result)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Row(CRIM=14.0507, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.597, RM=6.657, AGE=100.0, DIS=1.5275, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=35.05, LSTAT=21.22, price=17.2)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filer_result[0]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'AGE': 100.0,\n", " 'B': 35.05,\n", " 'CHAS': 0.0,\n", " 'CRIM': 14.0507,\n", " 'DIS': 1.5275,\n", " 'INDUS': 18.1,\n", " 'LSTAT': 21.22,\n", " 'NOX': 0.597,\n", " 'PTRATIO': 20.2,\n", " 'RAD': 24.0,\n", " 'RM': 6.657,\n", " 'TAX': 666.0,\n", " 'ZN': 0.0,\n", " 'price': 17.2}" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filer_result[0].asDict()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "100.0" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filer_result[0].asDict()['AGE']" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# END OF COURSE 8.26 \n", "# next 8.27" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9) Spark df groupby " ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_boston.groupby(\"RAD\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[RAD: double, avg(CRIM): double, avg(ZN): double, avg(INDUS): double, avg(CHAS): double, avg(NOX): double, avg(RM): double, avg(AGE): double, avg(DIS): double, avg(RAD): double, avg(TAX): double, avg(PTRATIO): double, avg(B): double, avg(LSTAT): double, avg(price): double]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_boston.groupby(\"RAD\").mean()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+--------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+--------+------------------+------------------+------------------+------------------+------------------+\n", "| RAD| avg(CRIM)| avg(ZN)| avg(INDUS)| avg(CHAS)| avg(NOX)| avg(RM)| avg(AGE)| avg(DIS)|avg(RAD)| avg(TAX)| avg(PTRATIO)| avg(B)| avg(LSTAT)| avg(price)|\n", "+----+--------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+--------+------------------+------------------+------------------+------------------+------------------+\n", "| 8.0| 0.3714095833333333| 6.25| 5.9325|0.20833333333333334|0.49249999999999977|6.9507916666666665| 67.35| 4.410604166666666| 8.0| 301.25| 17.97499999999999| 385.2775|7.9608333333333325|30.358333333333334|\n", "| 7.0| 0.15039941176470584|26.705882352941178| 5.034705882352943| 0.0| 0.4409999999999999| 6.647764705882355|40.14117647058823| 6.495858823529412| 7.0| 304.4117647058824| 18.40588235294117| 388.4329411764706| 7.98823529411765|27.105882352941173|\n", "| 1.0|0.036028500000000005| 39.9| 5.066000000000001| 0.05| 0.4628900000000001| 6.555850000000001| 45.025| 6.027824999999999| 1.0| 291.45|17.564999999999998|389.27299999999997| 7.369999999999999|24.364999999999995|\n", "| 4.0| 0.39390345454545467|14.731818181818182|10.746636363636362|0.07272727272727272| 0.5043109090909086|6.1331181818181815|60.84181818181818| 4.433| 4.0| 335.9818181818182|19.136363636363644| 382.721272727273|12.199090909090906| 21.38727272727272|\n", "| 3.0| 0.09735947368421054| 16.38157894736842| 4.41894736842105|0.05263157894736842| 0.4524236842105263| 6.473026315789473|49.31052631578947| 5.146492105263157| 3.0|246.07894736842104|18.168421052631583| 392.417894736842| 9.076052631578946|27.928947368421053|\n", "| 2.0| 0.08328541666666667|20.416666666666668| 9.630833333333337| 0.0|0.48491666666666644| 6.649958333333335|64.77083333333333| 4.097008333333332| 2.0| 260.6666666666667| 17.29166666666667| 386.41375|10.024583333333332| 26.83333333333333|\n", "| 6.0| 0.15005384615384618| 12.98076923076923| 8.198076923076924| 0.0| 0.5148461538461541| 6.104730769230769|60.14230769230768| 4.024915384615384| 6.0|372.88461538461536|17.815384615384616| 387.3665384615384|12.305769230769231|20.976923076923075|\n", "| 5.0| 0.6877916521739129|11.108695652173912| 9.759826086956538|0.09565217391304348| 0.5708834782608687| 6.39315652173913|69.22347826086958|3.6973286956521743| 5.0| 331.8434782608696| 16.54695652173914|369.19243478260887|10.657217391304345| 25.70695652173912|\n", "|24.0| 12.759290909090915| 0.0|18.099999999999955|0.06060606060606061| 0.6724166666666664| 6.022151515151517|89.80530303030304|2.0612537878787878| 24.0| 666.0|20.199999999999985|288.08916666666687|18.600606060606058| 16.40378787878788|\n", "+----+--------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+--------+------------------+------------------+------------------+------------------+------------------+\n", "\n" ] } ], "source": [ "df_boston.groupby(\"RAD\").mean().show()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------------+\n", "| avg(RAD)|\n", "+-----------------+\n", "|9.549407114624506|\n", "+-----------------+\n", "\n" ] } ], "source": [ "# agg \n", "df_boston.agg({'RAD':'mean'}).show()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-------+\n", "| RAD|max(ZN)|\n", "+----+-------+\n", "| 8.0| 25.0|\n", "| 7.0| 34.0|\n", "| 1.0| 90.0|\n", "| 4.0| 95.0|\n", "| 3.0| 95.0|\n", "| 2.0| 85.0|\n", "| 6.0| 52.5|\n", "| 5.0| 100.0|\n", "|24.0| 0.0|\n", "+----+-------+\n", "\n" ] } ], "source": [ "# group + agg \n", "group_data = df_boston.groupby('RAD')\n", "group_data.agg({'ZN': 'max'}).show()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+--------+\n", "| RAD|max(NOX)|\n", "+----+--------+\n", "| 8.0| 0.507|\n", "| 7.0| 0.472|\n", "| 1.0| 0.573|\n", "| 4.0| 0.624|\n", "| 3.0| 0.488|\n", "| 2.0| 0.581|\n", "| 6.0| 0.585|\n", "| 5.0| 0.871|\n", "|24.0| 0.77|\n", "+----+--------+\n", "\n", "+----+-------+\n", "| RAD|max(RM)|\n", "+----+-------+\n", "| 8.0| 8.725|\n", "| 7.0| 8.259|\n", "| 1.0| 7.923|\n", "| 4.0| 8.034|\n", "| 3.0| 7.831|\n", "| 2.0| 8.069|\n", "| 6.0| 6.897|\n", "| 5.0| 8.704|\n", "|24.0| 8.78|\n", "+----+-------+\n", "\n", "+----+--------+\n", "| RAD|max(AGE)|\n", "+----+--------+\n", "| 8.0| 93.4|\n", "| 7.0| 79.2|\n", "| 1.0| 91.0|\n", "| 4.0| 100.0|\n", "| 3.0| 95.6|\n", "| 2.0| 97.0|\n", "| 6.0| 95.4|\n", "| 5.0| 100.0|\n", "|24.0| 100.0|\n", "+----+--------+\n", "\n" ] } ], "source": [ "# group + agg for loop -- general case \n", "\n", "group_data = df_boston.groupby('RAD')\n", "for col in ['NOX','RM','AGE']:\n", " group_data.agg({col: 'max'}).show()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "|count(DISTINCT CRIM)|\n", "+--------------------+\n", "| 504|\n", "+--------------------+\n", "\n" ] } ], "source": [ "# import spark default func : countDistinct, avg, stddev\n", "\n", "df_boston.select(countDistinct('CRIM')).show()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------------------+\n", "| avg(CRIM)|\n", "+------------------+\n", "|3.6135235573122535|\n", "+------------------+\n", "\n" ] } ], "source": [ "df_boston.select(avg('CRIM')).show()\n", "\n", "# compare with the groupby method \n", "# df_boston.agg({'CRIM':'mean'}).show()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+\n", "|AVG_CRIM|\n", "+--------+\n", "| 504|\n", "+--------+\n", "\n" ] } ], "source": [ "# rename the result col \n", "df_boston.select(countDistinct('CRIM').alias('AVG_CRIM')).show()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------------+\n", "| std|\n", "+-----------------+\n", "|8.601545105332491|\n", "+-----------------+\n", "\n", "None\n", "+---------------------+\n", "|format_number(std, 2)|\n", "+---------------------+\n", "| 8.60|\n", "+---------------------+\n", "\n", "None\n" ] } ], "source": [ "# format result digits \n", "\n", "crim_std = df_boston.select(stddev('CRIM').alias('std'))\n", "\n", "print (crim_std.show())\n", "\n", "print (crim_std.select(format_number('std',2)).show())" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+-----+-----+----+------+-----+----+-------+---+-----+-------+------+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS|RAD| TAX|PTRATIO| B|LSTAT|price|\n", "+-------+-----+-----+----+------+-----+----+-------+---+-----+-------+------+-----+-----+\n", "|0.00632| 18.0| 2.31| 0.0| 0.538|6.575|65.2| 4.09|1.0|296.0| 15.3| 396.9| 4.98| 24.0|\n", "|0.00906| 90.0| 2.97| 0.0| 0.4|7.088|20.8| 7.3073|1.0|285.0| 15.3|394.72| 7.85| 32.2|\n", "|0.01096| 55.0| 2.25| 0.0| 0.389|6.453|31.9| 7.3073|1.0|300.0| 15.3|394.72| 8.23| 22.0|\n", "|0.01301| 35.0| 1.52| 0.0| 0.442|7.241|49.3| 7.0379|1.0|284.0| 15.5|394.74| 5.49| 32.7|\n", "|0.01311| 90.0| 1.22| 0.0| 0.403|7.249|21.9| 8.6966|5.0|226.0| 17.9|395.93| 4.81| 35.4|\n", "| 0.0136| 75.0| 4.0| 0.0| 0.41|5.888|47.6| 7.3197|3.0|469.0| 21.1| 396.9| 14.8| 18.9|\n", "|0.01381| 80.0| 0.46| 0.0| 0.422|7.875|32.0| 5.6484|4.0|255.0| 14.4|394.23| 2.97| 50.0|\n", "|0.01432|100.0| 1.32| 0.0| 0.411|6.816|40.5| 8.3248|5.0|256.0| 15.1| 392.9| 3.95| 31.6|\n", "|0.01439| 60.0| 2.93| 0.0| 0.401|6.604|18.8| 6.2196|1.0|265.0| 15.6| 376.7| 4.38| 29.1|\n", "|0.01501| 90.0| 1.21| 1.0| 0.401|7.923|24.8| 5.885|1.0|198.0| 13.6|395.52| 3.16| 50.0|\n", "|0.01501| 80.0| 2.01| 0.0| 0.435|6.635|29.7| 8.344|4.0|280.0| 17.0|390.94| 5.99| 24.5|\n", "|0.01538| 90.0| 3.75| 0.0| 0.394|7.454|34.2| 6.3361|3.0|244.0| 15.9|386.34| 3.11| 44.0|\n", "|0.01709| 90.0| 2.02| 0.0| 0.41|6.728|36.1|12.1265|5.0|187.0| 17.0|384.46| 4.5| 30.1|\n", "|0.01778| 95.0| 1.47| 0.0| 0.403|7.135|13.9| 7.6534|3.0|402.0| 17.0| 384.3| 4.45| 32.9|\n", "| 0.0187| 85.0| 4.15| 0.0| 0.429|6.516|27.7| 8.5353|4.0|351.0| 17.9|392.43| 6.36| 23.1|\n", "|0.01951| 17.5| 1.38| 0.0|0.4161|7.104|59.5| 9.2229|3.0|216.0| 18.6|393.24| 8.05| 33.0|\n", "|0.01965| 80.0| 1.76| 0.0| 0.385| 6.23|31.5| 9.0892|1.0|241.0| 18.2| 341.6|12.93| 20.1|\n", "|0.02009| 95.0| 2.68| 0.0|0.4161|8.034|31.9| 5.118|4.0|224.0| 14.7|390.55| 2.88| 50.0|\n", "|0.02055| 85.0| 0.74| 0.0| 0.41|6.383|35.7| 9.1876|2.0|313.0| 17.3| 396.9| 5.77| 24.7|\n", "|0.02177| 82.5| 2.03| 0.0| 0.415| 7.61|15.7| 6.27|2.0|348.0| 14.7|395.38| 3.11| 42.3|\n", "+-------+-----+-----+----+------+-----+----+-------+---+-----+-------+------+-----+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# order by \n", "\n", "df_boston.orderBy('CRIM').show()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+------+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS| RAD| TAX|PTRATIO| B|LSTAT|price|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+------+-----+-----+\n", "|88.9762|0.0| 18.1| 0.0|0.671|6.968| 91.9|1.4165|24.0|666.0| 20.2| 396.9|17.21| 10.4|\n", "|73.5341|0.0| 18.1| 0.0|0.679|5.957|100.0|1.8026|24.0|666.0| 20.2| 16.45|20.62| 8.8|\n", "|67.9208|0.0| 18.1| 0.0|0.693|5.683|100.0|1.4254|24.0|666.0| 20.2|384.97|22.98| 5.0|\n", "|51.1358|0.0| 18.1| 0.0|0.597|5.757|100.0| 1.413|24.0|666.0| 20.2| 2.6|10.11| 15.0|\n", "|45.7461|0.0| 18.1| 0.0|0.693|4.519|100.0|1.6582|24.0|666.0| 20.2| 88.27|36.98| 7.0|\n", "|41.5292|0.0| 18.1| 0.0|0.693|5.531| 85.4|1.6074|24.0|666.0| 20.2|329.46|27.38| 8.5|\n", "|38.3518|0.0| 18.1| 0.0|0.693|5.453|100.0|1.4896|24.0|666.0| 20.2| 396.9|30.59| 5.0|\n", "|37.6619|0.0| 18.1| 0.0|0.679|6.202| 78.7|1.8629|24.0|666.0| 20.2| 18.82|14.52| 10.9|\n", "|28.6558|0.0| 18.1| 0.0|0.597|5.155|100.0|1.5894|24.0|666.0| 20.2|210.97|20.08| 16.3|\n", "|25.9406|0.0| 18.1| 0.0|0.679|5.304| 89.1|1.6475|24.0|666.0| 20.2|127.36|26.64| 10.4|\n", "|25.0461|0.0| 18.1| 0.0|0.693|5.987|100.0|1.5888|24.0|666.0| 20.2| 396.9|26.77| 5.6|\n", "|24.8017|0.0| 18.1| 0.0|0.693|5.349| 96.0|1.7028|24.0|666.0| 20.2| 396.9|19.77| 8.3|\n", "|24.3938|0.0| 18.1| 0.0| 0.7|4.652|100.0|1.4672|24.0|666.0| 20.2| 396.9|28.28| 10.5|\n", "|23.6482|0.0| 18.1| 0.0|0.671| 6.38| 96.2|1.3861|24.0|666.0| 20.2| 396.9|23.69| 13.1|\n", "|22.5971|0.0| 18.1| 0.0| 0.7| 5.0| 89.5|1.5184|24.0|666.0| 20.2| 396.9|31.99| 7.4|\n", "|22.0511|0.0| 18.1| 0.0| 0.74|5.818| 92.4|1.8662|24.0|666.0| 20.2|391.45|22.11| 10.5|\n", "|20.7162|0.0| 18.1| 0.0|0.659|4.138|100.0|1.1781|24.0|666.0| 20.2|370.22|23.34| 11.9|\n", "|20.0849|0.0| 18.1| 0.0| 0.7|4.368| 91.2|1.4395|24.0|666.0| 20.2|285.83|30.63| 8.8|\n", "|19.6091|0.0| 18.1| 0.0|0.671|7.313| 97.9|1.3163|24.0|666.0| 20.2| 396.9|13.44| 15.0|\n", "| 18.811|0.0| 18.1| 0.0|0.597|4.628|100.0|1.5539|24.0|666.0| 20.2| 28.79|34.37| 17.9|\n", "+-------+---+-----+----+-----+-----+-----+------+----+-----+-------+------+-----+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# order by : inverse order \n", "\n", "df_boston.orderBy(df_boston['CRIM'].desc()).show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 10) Missing data " ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS|RAD| TAX|PTRATIO| B|LSTAT|price|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2| 4.09|1.0|296.0| 15.3| 396.9| 4.98| 24.0|\n", "|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0| 17.8| 396.9| 9.14| 21.6|\n", "|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0| 17.8|392.83| 4.03| 34.7|\n", "|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0| 18.7|394.63| 2.94| 33.4|\n", "|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0| 18.7| 396.9| 5.33| 36.2|\n", "|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0| 18.7|394.12| 5.21| 28.7|\n", "|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0| 15.2| 395.6|12.43| 22.9|\n", "|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0| 15.2| 396.9|19.15| 27.1|\n", "|0.21124|12.5| 7.87| 0.0|0.524|5.631|100.0|6.0821|5.0|311.0| 15.2|386.63|29.93| 16.5|\n", "|0.17004|12.5| 7.87| 0.0|0.524|6.004| 85.9|6.5921|5.0|311.0| 15.2|386.71| 17.1| 18.9|\n", "|0.22489|12.5| 7.87| 0.0|0.524|6.377| 94.3|6.3467|5.0|311.0| 15.2|392.52|20.45| 15.0|\n", "|0.11747|12.5| 7.87| 0.0|0.524|6.009| 82.9|6.2267|5.0|311.0| 15.2| 396.9|13.27| 18.9|\n", "|0.09378|12.5| 7.87| 0.0|0.524|5.889| 39.0|5.4509|5.0|311.0| 15.2| 390.5|15.71| 21.7|\n", "|0.62976| 0.0| 8.14| 0.0|0.538|5.949| 61.8|4.7075|4.0|307.0| 21.0| 396.9| 8.26| 20.4|\n", "|0.63796| 0.0| 8.14| 0.0|0.538|6.096| 84.5|4.4619|4.0|307.0| 21.0|380.02|10.26| 18.2|\n", "|0.62739| 0.0| 8.14| 0.0|0.538|5.834| 56.5|4.4986|4.0|307.0| 21.0|395.62| 8.47| 19.9|\n", "|1.05393| 0.0| 8.14| 0.0|0.538|5.935| 29.3|4.4986|4.0|307.0| 21.0|386.85| 6.58| 23.1|\n", "| 0.7842| 0.0| 8.14| 0.0|0.538| 5.99| 81.7|4.2579|4.0|307.0| 21.0|386.75|14.67| 17.5|\n", "|0.80271| 0.0| 8.14| 0.0|0.538|5.456| 36.6|3.7965|4.0|307.0| 21.0|288.99|11.69| 20.2|\n", "| 0.7258| 0.0| 8.14| 0.0|0.538|5.727| 69.5|3.7965|4.0|307.0| 21.0|390.95|11.28| 18.2|\n", "+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# drop null data \n", "df_boston.na.drop().show()\n", "\n", "# only drop when all column data is null \n", "#df_boston.na.drop(how='all').show()\n", "\n", "# only drop if specfic column data is null \n", "#df_boston.na.drop(subset = ['CRIM']).show()\n", "\n", "\n", "# fill null data \n", "#df_boston.na.fill('FILLED VALUE').show()\n", "\n", "\n", "# fill with avg value\n", "# from pyspark.sql.functions import mean \n", "# mean_val = df_boston.select(mean(df['CRIM'])).collect()\n", "# mean_val_ = mean_val[0][0]\n", "#df_boston.na.fill(mean_val_, ['CRIM']).show()\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 11) Dates and Timestamp " ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "# load timestamp csv \n", "\n", "# load the data\n", "df_timestamp = sqlContext.read\\\n", " .format('com.databricks.spark.csv')\\\n", " .options(header='true', inferschema='true')\\\n", " .load('df_test.csv')\n", " \n", "# rename column \"_c0\" -> \"time\"\n", "df_timestamp = df_timestamp.withColumnRenamed('_c0', 'time')" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------------+---------------+---------------+---------------+-----------------+\n", "| time| A| B| C| D|\n", "+-------------------+---------------+---------------+---------------+-----------------+\n", "|2013-01-01 00:00:00| 1.74284507345| 0.827612028809|-0.806632282957| 0.306776368559|\n", "|2013-01-02 00:00:00| 1.33922469984|-0.795165583739| -1.77473024596| 0.437705419137|\n", "|2013-01-03 00:00:00| 0.652848767373| -1.54077088745| -1.41021709968| 1.70035456582|\n", "|2013-01-04 00:00:00| 0.247396585844| -1.30234054482|-0.846092769624| 0.699764788482|\n", "|2013-01-05 00:00:00|-0.740530243845| 0.713124246753| -1.16320459805|-0.00284413632768|\n", "|2013-01-06 00:00:00| -1.17173754177| 0.560340996287| 0.946558700023| -0.615189047933|\n", "+-------------------+---------------+---------------+---------------+-----------------+\n", "\n" ] } ], "source": [ "df_timestamp.show()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# import timestamp OP methods \n", "from pyspark.sql.functions import (dayofmonth,\n", " hour,\n", " dayofyear,\n", " month, \n", " year,\n", " weekofyear,\n", " format_number,\n", " date_format)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----------------+-----------+----------+-------------------+\n", "|dayofmonth(time)|month(time)|year(time)| time|\n", "+----------------+-----------+----------+-------------------+\n", "| 1| 1| 2013|2013-01-01 00:00:00|\n", "| 2| 1| 2013|2013-01-02 00:00:00|\n", "| 3| 1| 2013|2013-01-03 00:00:00|\n", "| 4| 1| 2013|2013-01-04 00:00:00|\n", "| 5| 1| 2013|2013-01-05 00:00:00|\n", "| 6| 1| 2013|2013-01-06 00:00:00|\n", "+----------------+-----------+----------+-------------------+\n", "\n" ] } ], "source": [ "# transform timestemp via timestamp OP methods \n", "\n", "df_timestamp.select(dayofmonth(df_timestamp['time']),\n", " month(df_timestamp['time']),\n", " year(df_timestamp['time']),\n", " (df_timestamp['time'])).show()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+------------------+--------------+-------------------+------------------+---------+\n", "|year| avg(A)| avg(B)| avg(C)| avg(D)|avg(year)|\n", "+----+------------------+--------------+-------------------+------------------+---------+\n", "|2013|0.3450078901486666|-0.25619995736|-0.8423863827079999|0.4210946596228866| 2013.0|\n", "+----+------------------+--------------+-------------------+------------------+---------+\n", "\n" ] } ], "source": [ "# get avg value per year \n", "df_timestamp_new = df_timestamp.withColumn('year', year(df_timestamp['time']))\n", "df_timestamp_new.groupby('year').mean().show()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+------------------+\n", "|year| avg(A)|\n", "+----+------------------+\n", "|2013|0.3450078901486666|\n", "+----+------------------+\n", "\n" ] } ], "source": [ "# only select the needed columns \n", "df_timestamp_new.groupby('year').mean().select(['year','avg(A)']).show()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------------------------+\n", "|format_number(Average A value, 2)|\n", "+---------------------------------+\n", "| 0.35|\n", "+---------------------------------+\n", "\n", "+----+------------------+\n", "|year| Average A value|\n", "+----+------------------+\n", "|2013|0.3450078901486666|\n", "+----+------------------+\n", "\n" ] } ], "source": [ "# format the outcome \n", "result = df_timestamp_new.groupby('year').mean().select(['year','avg(A)'])\n", "new_result = result.withColumnRenamed(\"avg(A)\", \"Average A value\")\n", "new_result.select(format_number(\"Average A value\", 2)).alias('Average A value').show()\n", "\n", "\n", "new_result.show()" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "# end of Section 8 \n", "# next : Section 9 " ] } ], "metadata": { "kernelspec": { "display_name": "Environment (conda_pyspark_)", "language": "python", "name": "conda_pyspark_" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.5" } }, "nbformat": 4, "nbformat_minor": 2 }