{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Welcome to\n", " ____ __\n", " / __/__ ___ _____/ /__\n", " _\\ \\/ _ \\/ _ `/ __/ '_/\n", " /__ / .__/\\_,_/_/ /_/\\_\\ version 2.4.0\n", " /_/\n", "\n", "Using Python version 3.6.4 (default, Jan 16 2018 18:10:19)\n", "SparkSession available as 'spark'.\n", "local-1557023299182\n" ] } ], "source": [ "import os, sys, glob, datetime\n", "\n", "# specify spark version, python version\n", "spark_home = \"/home/zero/spark-2.4.0-bin-hadoop2.7\" # MODIFY THIS\n", "python_path=\"/apps/anaconda3/bin/python\"\n", "# set environment variables\n", "os.environ['SPARK_HOME'] = spark_home\n", "os.environ['PYSPARK_PYTHON'] = python_path\n", "os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n", "\n", "def setup_spark_env(app_name):\n", " # set environment variables\n", " spark_python = os.path.join(spark_home, 'python')\n", " py4j = glob.glob(os.path.join(spark_python, 'lib', 'py4j-*.zip'))[0]\n", " sys.path[:0] = [spark_python, py4j]\n", " # specify Spark application parameters\n", " PYSPARK_SUBMIT_ARGS=\"--master local[2]\"\n", "\n", " os.environ['PYSPARK_SUBMIT_ARGS'] = (PYSPARK_SUBMIT_ARGS \n", " + \" --name '%s_%s'\"%(app_name, datetime.datetime.now().strftime(\"%Y%m%d %H:%M\")) \n", " + \" pyspark-shell\") \n", " return\n", "\n", "#\n", "setup_spark_env(\"your_spark_process_name\") # MODIFY THIS\n", "# launching PySpark application\n", "# execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))\n", "filename=os.path.join(spark_home, 'python/pyspark/shell.py')\n", "exec(compile(open(filename, \"rb\").read(), filename, 'exec'))\n", "sc.setLogLevel('ERROR')\n", "print(\"{}\".format(sc.applicationId))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import functions as sf\n", "from pyspark.sql import Row\n", "from pyspark.sql.types import *\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import os, math, subprocess\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- SK_ID_CURR: string (nullable = true)\n", " |-- TARGET: string (nullable = true)\n", " |-- NAME_CONTRACT_TYPE: string (nullable = true)\n", " |-- CODE_GENDER: string (nullable = true)\n", " |-- FLAG_OWN_CAR: string (nullable = true)\n", " |-- FLAG_OWN_REALTY: string (nullable = true)\n", " |-- CNT_CHILDREN: string (nullable = true)\n", " |-- AMT_INCOME_TOTAL: string (nullable = true)\n", " |-- AMT_CREDIT: string (nullable = true)\n", " |-- AMT_ANNUITY: string (nullable = true)\n", " |-- AMT_GOODS_PRICE: string (nullable = true)\n", " |-- NAME_TYPE_SUITE: string (nullable = true)\n", " |-- NAME_INCOME_TYPE: string (nullable = true)\n", " |-- NAME_EDUCATION_TYPE: string (nullable = true)\n", " |-- NAME_FAMILY_STATUS: string (nullable = true)\n", " |-- NAME_HOUSING_TYPE: string (nullable = true)\n", " |-- REGION_POPULATION_RELATIVE: string (nullable = true)\n", " |-- DAYS_BIRTH: string (nullable = true)\n", " |-- DAYS_EMPLOYED: string (nullable = true)\n", " |-- DAYS_REGISTRATION: string (nullable = true)\n", " |-- DAYS_ID_PUBLISH: string (nullable = true)\n", " |-- OWN_CAR_AGE: string (nullable = true)\n", " |-- FLAG_MOBIL: string (nullable = true)\n", " |-- FLAG_EMP_PHONE: string (nullable = true)\n", " |-- FLAG_WORK_PHONE: string (nullable = true)\n", " |-- FLAG_CONT_MOBILE: string (nullable = true)\n", " |-- FLAG_PHONE: string (nullable = true)\n", " |-- FLAG_EMAIL: string (nullable = true)\n", " |-- OCCUPATION_TYPE: string (nullable = true)\n", " |-- CNT_FAM_MEMBERS: string (nullable = true)\n", " |-- REGION_RATING_CLIENT: string (nullable = true)\n", " |-- REGION_RATING_CLIENT_W_CITY: string (nullable = true)\n", " |-- WEEKDAY_APPR_PROCESS_START: string (nullable = true)\n", " |-- HOUR_APPR_PROCESS_START: string (nullable = true)\n", " |-- REG_REGION_NOT_LIVE_REGION: string (nullable = true)\n", " |-- REG_REGION_NOT_WORK_REGION: string (nullable = true)\n", " |-- LIVE_REGION_NOT_WORK_REGION: string (nullable = true)\n", " |-- REG_CITY_NOT_LIVE_CITY: string (nullable = true)\n", " |-- REG_CITY_NOT_WORK_CITY: string (nullable = true)\n", " |-- LIVE_CITY_NOT_WORK_CITY: string (nullable = true)\n", " |-- ORGANIZATION_TYPE: string (nullable = true)\n", " |-- EXT_SOURCE_1: string (nullable = true)\n", " |-- EXT_SOURCE_2: string (nullable = true)\n", " |-- EXT_SOURCE_3: string (nullable = true)\n", " |-- APARTMENTS_AVG: string (nullable = true)\n", " |-- BASEMENTAREA_AVG: string (nullable = true)\n", " |-- YEARS_BEGINEXPLUATATION_AVG: string (nullable = true)\n", " |-- YEARS_BUILD_AVG: string (nullable = true)\n", " |-- COMMONAREA_AVG: string (nullable = true)\n", " |-- ELEVATORS_AVG: string (nullable = true)\n", " |-- ENTRANCES_AVG: string (nullable = true)\n", " |-- FLOORSMAX_AVG: string (nullable = true)\n", " |-- FLOORSMIN_AVG: string (nullable = true)\n", " |-- LANDAREA_AVG: string (nullable = true)\n", " |-- LIVINGAPARTMENTS_AVG: string (nullable = true)\n", " |-- LIVINGAREA_AVG: string (nullable = true)\n", " |-- NONLIVINGAPARTMENTS_AVG: string (nullable = true)\n", " |-- NONLIVINGAREA_AVG: string (nullable = true)\n", " |-- APARTMENTS_MODE: string (nullable = true)\n", " |-- BASEMENTAREA_MODE: string (nullable = true)\n", " |-- YEARS_BEGINEXPLUATATION_MODE: string (nullable = true)\n", " |-- YEARS_BUILD_MODE: string (nullable = true)\n", " |-- COMMONAREA_MODE: string (nullable = true)\n", " |-- ELEVATORS_MODE: string (nullable = true)\n", " |-- ENTRANCES_MODE: string (nullable = true)\n", " |-- FLOORSMAX_MODE: string (nullable = true)\n", " |-- FLOORSMIN_MODE: string (nullable = true)\n", " |-- LANDAREA_MODE: string (nullable = true)\n", " |-- LIVINGAPARTMENTS_MODE: string (nullable = true)\n", " |-- LIVINGAREA_MODE: string (nullable = true)\n", " |-- NONLIVINGAPARTMENTS_MODE: string (nullable = true)\n", " |-- NONLIVINGAREA_MODE: string (nullable = true)\n", " |-- APARTMENTS_MEDI: string (nullable = true)\n", " |-- BASEMENTAREA_MEDI: string (nullable = true)\n", " |-- YEARS_BEGINEXPLUATATION_MEDI: string (nullable = true)\n", " |-- YEARS_BUILD_MEDI: string (nullable = true)\n", " |-- COMMONAREA_MEDI: string (nullable = true)\n", " |-- ELEVATORS_MEDI: string (nullable = true)\n", " |-- ENTRANCES_MEDI: string (nullable = true)\n", " |-- FLOORSMAX_MEDI: string (nullable = true)\n", " |-- FLOORSMIN_MEDI: string (nullable = true)\n", " |-- LANDAREA_MEDI: string (nullable = true)\n", " |-- LIVINGAPARTMENTS_MEDI: string (nullable = true)\n", " |-- LIVINGAREA_MEDI: string (nullable = true)\n", " |-- NONLIVINGAPARTMENTS_MEDI: string (nullable = true)\n", " |-- NONLIVINGAREA_MEDI: string (nullable = true)\n", " |-- FONDKAPREMONT_MODE: string (nullable = true)\n", " |-- HOUSETYPE_MODE: string (nullable = true)\n", " |-- TOTALAREA_MODE: string (nullable = true)\n", " |-- WALLSMATERIAL_MODE: string (nullable = true)\n", " |-- EMERGENCYSTATE_MODE: string (nullable = true)\n", " |-- OBS_30_CNT_SOCIAL_CIRCLE: string (nullable = true)\n", " |-- DEF_30_CNT_SOCIAL_CIRCLE: string (nullable = true)\n", " |-- OBS_60_CNT_SOCIAL_CIRCLE: string (nullable = true)\n", " |-- DEF_60_CNT_SOCIAL_CIRCLE: string (nullable = true)\n", " |-- DAYS_LAST_PHONE_CHANGE: string (nullable = true)\n", " |-- FLAG_DOCUMENT_2: string (nullable = true)\n", " |-- FLAG_DOCUMENT_3: string (nullable = true)\n", " |-- FLAG_DOCUMENT_4: string (nullable = true)\n", " |-- FLAG_DOCUMENT_5: string (nullable = true)\n", " |-- FLAG_DOCUMENT_6: string (nullable = true)\n", " |-- FLAG_DOCUMENT_7: string (nullable = true)\n", " |-- FLAG_DOCUMENT_8: string (nullable = true)\n", " |-- FLAG_DOCUMENT_9: string (nullable = true)\n", " |-- FLAG_DOCUMENT_10: string (nullable = true)\n", " |-- FLAG_DOCUMENT_11: string (nullable = true)\n", " |-- FLAG_DOCUMENT_12: string (nullable = true)\n", " |-- FLAG_DOCUMENT_13: string (nullable = true)\n", " |-- FLAG_DOCUMENT_14: string (nullable = true)\n", " |-- FLAG_DOCUMENT_15: string (nullable = true)\n", " |-- FLAG_DOCUMENT_16: string (nullable = true)\n", " |-- FLAG_DOCUMENT_17: string (nullable = true)\n", " |-- FLAG_DOCUMENT_18: string (nullable = true)\n", " |-- FLAG_DOCUMENT_19: string (nullable = true)\n", " |-- FLAG_DOCUMENT_20: string (nullable = true)\n", " |-- FLAG_DOCUMENT_21: string (nullable = true)\n", " |-- AMT_REQ_CREDIT_BUREAU_HOUR: string (nullable = true)\n", " |-- AMT_REQ_CREDIT_BUREAU_DAY: string (nullable = true)\n", " |-- AMT_REQ_CREDIT_BUREAU_WEEK: string (nullable = true)\n", " |-- AMT_REQ_CREDIT_BUREAU_MON: string (nullable = true)\n", " |-- AMT_REQ_CREDIT_BUREAU_QRT: string (nullable = true)\n", " |-- AMT_REQ_CREDIT_BUREAU_YEAR: string (nullable = true)\n", "\n", "Rows: 307511, Cols: 122\n" ] } ], "source": [ "# load data\n", "data_path = \"home-credit-default-risk/application_train.csv\"\n", "df_data = sqlContext.read.format(\"csv\").option(\"header\", \"true\").load(data_path)\n", "df_data.printSchema()\n", "print(\"Rows: {}, Cols: {}\".format(df_data.count(), len(df_data.columns)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# SELECT, WHERE, DISTINCT, LIMIT" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT *\n", " FROM pdf_data\n", " LIMIT 3\n", "\n" ] }, { "data": { "text/plain": [ "[Row(SK_ID_CURR='100002', TARGET='1', NAME_CONTRACT_TYPE='Cash loans', CODE_GENDER='M', FLAG_OWN_CAR='N', FLAG_OWN_REALTY='Y', CNT_CHILDREN='0', AMT_INCOME_TOTAL='202500.0', AMT_CREDIT='406597.5', AMT_ANNUITY='24700.5', AMT_GOODS_PRICE='351000.0', NAME_TYPE_SUITE='Unaccompanied', NAME_INCOME_TYPE='Working', NAME_EDUCATION_TYPE='Secondary / secondary special', NAME_FAMILY_STATUS='Single / not married', NAME_HOUSING_TYPE='House / apartment', REGION_POPULATION_RELATIVE='0.018801', DAYS_BIRTH='-9461', DAYS_EMPLOYED='-637', DAYS_REGISTRATION='-3648.0', DAYS_ID_PUBLISH='-2120', OWN_CAR_AGE=None, FLAG_MOBIL='1', FLAG_EMP_PHONE='1', FLAG_WORK_PHONE='0', FLAG_CONT_MOBILE='1', FLAG_PHONE='1', FLAG_EMAIL='0', OCCUPATION_TYPE='Laborers', CNT_FAM_MEMBERS='1.0', REGION_RATING_CLIENT='2', REGION_RATING_CLIENT_W_CITY='2', WEEKDAY_APPR_PROCESS_START='WEDNESDAY', HOUR_APPR_PROCESS_START='10', REG_REGION_NOT_LIVE_REGION='0', REG_REGION_NOT_WORK_REGION='0', LIVE_REGION_NOT_WORK_REGION='0', REG_CITY_NOT_LIVE_CITY='0', REG_CITY_NOT_WORK_CITY='0', LIVE_CITY_NOT_WORK_CITY='0', ORGANIZATION_TYPE='Business Entity Type 3', EXT_SOURCE_1='0.08303696739132256', EXT_SOURCE_2='0.2629485927471776', EXT_SOURCE_3='0.13937578009978951', APARTMENTS_AVG='0.0247', BASEMENTAREA_AVG='0.0369', YEARS_BEGINEXPLUATATION_AVG='0.9722', YEARS_BUILD_AVG='0.6192', COMMONAREA_AVG='0.0143', ELEVATORS_AVG='0.0', ENTRANCES_AVG='0.069', FLOORSMAX_AVG='0.0833', FLOORSMIN_AVG='0.125', LANDAREA_AVG='0.0369', LIVINGAPARTMENTS_AVG='0.0202', LIVINGAREA_AVG='0.019', NONLIVINGAPARTMENTS_AVG='0.0', NONLIVINGAREA_AVG='0.0', APARTMENTS_MODE='0.0252', BASEMENTAREA_MODE='0.0383', YEARS_BEGINEXPLUATATION_MODE='0.9722', YEARS_BUILD_MODE='0.6341', COMMONAREA_MODE='0.0144', ELEVATORS_MODE='0.0', ENTRANCES_MODE='0.069', FLOORSMAX_MODE='0.0833', FLOORSMIN_MODE='0.125', LANDAREA_MODE='0.0377', LIVINGAPARTMENTS_MODE='0.022', LIVINGAREA_MODE='0.0198', NONLIVINGAPARTMENTS_MODE='0.0', NONLIVINGAREA_MODE='0.0', APARTMENTS_MEDI='0.025', BASEMENTAREA_MEDI='0.0369', YEARS_BEGINEXPLUATATION_MEDI='0.9722', YEARS_BUILD_MEDI='0.6243', COMMONAREA_MEDI='0.0144', ELEVATORS_MEDI='0.0', ENTRANCES_MEDI='0.069', FLOORSMAX_MEDI='0.0833', FLOORSMIN_MEDI='0.125', LANDAREA_MEDI='0.0375', LIVINGAPARTMENTS_MEDI='0.0205', LIVINGAREA_MEDI='0.0193', NONLIVINGAPARTMENTS_MEDI='0.0', NONLIVINGAREA_MEDI='0.0', FONDKAPREMONT_MODE='reg oper account', HOUSETYPE_MODE='block of flats', TOTALAREA_MODE='0.0149', WALLSMATERIAL_MODE='Stone, brick', EMERGENCYSTATE_MODE='No', OBS_30_CNT_SOCIAL_CIRCLE='2.0', DEF_30_CNT_SOCIAL_CIRCLE='2.0', OBS_60_CNT_SOCIAL_CIRCLE='2.0', DEF_60_CNT_SOCIAL_CIRCLE='2.0', DAYS_LAST_PHONE_CHANGE='-1134.0', FLAG_DOCUMENT_2='0', FLAG_DOCUMENT_3='1', FLAG_DOCUMENT_4='0', FLAG_DOCUMENT_5='0', FLAG_DOCUMENT_6='0', FLAG_DOCUMENT_7='0', FLAG_DOCUMENT_8='0', FLAG_DOCUMENT_9='0', FLAG_DOCUMENT_10='0', FLAG_DOCUMENT_11='0', FLAG_DOCUMENT_12='0', FLAG_DOCUMENT_13='0', FLAG_DOCUMENT_14='0', FLAG_DOCUMENT_15='0', FLAG_DOCUMENT_16='0', FLAG_DOCUMENT_17='0', FLAG_DOCUMENT_18='0', FLAG_DOCUMENT_19='0', FLAG_DOCUMENT_20='0', FLAG_DOCUMENT_21='0', AMT_REQ_CREDIT_BUREAU_HOUR='0.0', AMT_REQ_CREDIT_BUREAU_DAY='0.0', AMT_REQ_CREDIT_BUREAU_WEEK='0.0', AMT_REQ_CREDIT_BUREAU_MON='0.0', AMT_REQ_CREDIT_BUREAU_QRT='0.0', AMT_REQ_CREDIT_BUREAU_YEAR='1.0')]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"\"\"\n", " SELECT *\n", " FROM pdf_data\n", " LIMIT 3\n", "\"\"\")\n", "df_data.limit(3).take(1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT NAME_CONTRACT_TYPE\n", " FROM pdf_data\n", " WHERE CODE_GENDER = 'M'\n", "\n", "+------------------+\n", "|NAME_CONTRACT_TYPE|\n", "+------------------+\n", "| Cash loans|\n", "| Revolving loans|\n", "| Cash loans|\n", "| Cash loans|\n", "| Cash loans|\n", "+------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT NAME_CONTRACT_TYPE\n", " FROM pdf_data\n", " WHERE CODE_GENDER = 'M'\n", "\"\"\")\n", "df_data.where(\"CODE_GENDER = 'M'\").select(\"NAME_CONTRACT_TYPE\").show(5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT DISTINCT NAME_CONTRACT_TYPE\n", " FROM pdf_data\n", "\n", "+------------------+\n", "|NAME_CONTRACT_TYPE|\n", "+------------------+\n", "| Revolving loans|\n", "| Cash loans|\n", "+------------------+\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT DISTINCT NAME_CONTRACT_TYPE\n", " FROM pdf_data\n", "\"\"\")\n", "df_data.select(\"NAME_CONTRACT_TYPE\").distinct().show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# SELECT with multiple conditions" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT NAME_INCOME_TYPE, CODE_GENDER, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " WHERE CODE_GENDER = 'M' AND AMT_INCOME_TOTAL > 200000.0\n", "\n", "+--------------------+-----------+----------------+\n", "| NAME_INCOME_TYPE|CODE_GENDER|AMT_INCOME_TOTAL|\n", "+--------------------+-----------+----------------+\n", "| Working| M| 202500.0|\n", "| State servant| M| 360000.0|\n", "| Working| M| 225000.0|\n", "| State servant| M| 270000.0|\n", "|Commercial associate| M| 360000.0|\n", "+--------------------+-----------+----------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT NAME_INCOME_TYPE, CODE_GENDER, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " WHERE CODE_GENDER = 'M' AND AMT_INCOME_TOTAL > 200000.0\n", "\"\"\")\n", "\n", "(df_data.where(\"CODE_GENDER = 'M' and AMT_INCOME_TOTAL > 200000.0\")\n", " .select(\"NAME_INCOME_TYPE\", \"CODE_GENDER\", \"AMT_INCOME_TOTAL\")).show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ORDER BY" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " ORDER BY AMT_INCOME_TOTAL\n", "\n", "+--------------------+----------------+\n", "| NAME_INCOME_TYPE|AMT_INCOME_TOTAL|\n", "+--------------------+----------------+\n", "| Working| 100071.0|\n", "|Commercial associate| 100089.0|\n", "| Working| 100125.0|\n", "|Commercial associate| 1001826.0|\n", "| State servant| 100278.0|\n", "+--------------------+----------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " ORDER BY AMT_INCOME_TOTAL\n", "\"\"\")\n", "df_data.select(\"NAME_INCOME_TYPE\", \"AMT_INCOME_TOTAL\").orderBy([\"AMT_INCOME_TOTAL\"]).show(5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " ORDER BY AMT_INCOME_TOTAL DESC\n", "\n", "+----------------+----------------+\n", "|NAME_INCOME_TYPE|AMT_INCOME_TOTAL|\n", "+----------------+----------------+\n", "| Pensioner| 99900.0|\n", "| Pensioner| 99900.0|\n", "| Pensioner| 99900.0|\n", "| Pensioner| 99900.0|\n", "| Pensioner| 99900.0|\n", "+----------------+----------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " ORDER BY AMT_INCOME_TOTAL DESC\n", "\"\"\")\n", "df_data.select(\"NAME_INCOME_TYPE\", \"AMT_INCOME_TOTAL\").orderBy([\"AMT_INCOME_TOTAL\"], ascending=[0]).show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# IN… NOT IN" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " WHERE SK_ID_CURR IN (100002, 100010, 100011)\n", "\n", "+----------------+----------------+\n", "|NAME_INCOME_TYPE|AMT_INCOME_TOTAL|\n", "+----------------+----------------+\n", "| Working| 202500.0|\n", "| State servant| 360000.0|\n", "| Pensioner| 112500.0|\n", "+----------------+----------------+\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " WHERE SK_ID_CURR IN (100002, 100010, 100011)\n", "\"\"\")\n", "\n", "(df_data.select(\"NAME_INCOME_TYPE\", \"AMT_INCOME_TOTAL\")\n", " .where(\"SK_ID_CURR IN {}\".format((100002, 100010, 100011)))).show()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " WHERE SK_ID_CURR NOT IN (100002, 100010, 100011)\n", "\n", "+----------------+----------------+\n", "|NAME_INCOME_TYPE|AMT_INCOME_TOTAL|\n", "+----------------+----------------+\n", "| State servant| 270000.0|\n", "| Working| 67500.0|\n", "| Working| 135000.0|\n", "| Working| 121500.0|\n", "| State servant| 99000.0|\n", "+----------------+----------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL\n", " FROM pdf_data\n", " WHERE SK_ID_CURR NOT IN (100002, 100010, 100011)\n", "\"\"\")\n", "\n", "(df_data.select(\"NAME_INCOME_TYPE\", \"AMT_INCOME_TOTAL\")\n", " .where(\"SK_ID_CURR NOT IN {}\".format((100002, 100010, 100011)))).show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# GROUP BY, COUNT, ORDER BY" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT CODE_GENDER, COUNT(TARGET)\n", " FROM pdf_data\n", " GROUP BY CODE_GENDER\n", " ORDER BY NUM_TARGET\n", "\n", "+-----------+------+\n", "|CODE_GENDER| count|\n", "+-----------+------+\n", "| XNA| 4|\n", "| M|105059|\n", "| F|202448|\n", "+-----------+------+\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT CODE_GENDER, COUNT(TARGET)\n", " FROM pdf_data\n", " GROUP BY CODE_GENDER\n", " ORDER BY NUM_TARGET\n", "\"\"\")\n", "\n", "df_data.groupBy(\"CODE_GENDER\").agg(sf.count(sf.col(\"TARGET\")).alias(\"count\")).orderBy(\"count\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Aggregate functions (MIN, MAX, MEAN)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT MAX(AMT_INCOME_TOTAL), MIN(AMT_INCOME_TOTAL), MEAN(AMT_INCOME_TOTAL)\n", " FROM pdf_data\n", "\n", "+---------------------+---------------------+-------------------------------------+\n", "|max(AMT_INCOME_TOTAL)|min(AMT_INCOME_TOTAL)|avg(CAST(AMT_INCOME_TOTAL AS DOUBLE))|\n", "+---------------------+---------------------+-------------------------------------+\n", "| 99900.0| 100071.0| 168797.91929698453|\n", "+---------------------+---------------------+-------------------------------------+\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT MAX(AMT_INCOME_TOTAL), MIN(AMT_INCOME_TOTAL), MEAN(AMT_INCOME_TOTAL)\n", " FROM pdf_data\n", "\"\"\")\n", "\n", "df_data.selectExpr([\"MAX(AMT_INCOME_TOTAL)\", \n", " \"MIN(AMT_INCOME_TOTAL)\", \n", " \"MEAN(AMT_INCOME_TOTAL)\"]).show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# JOIN\n", "\n", "- (INNER) JOIN: Returns records that have matching values in both tables\n", "- LEFT (OUTER) JOIN: Return all records from the left table, and the matched records from the right table\n", "- RIGHT (OUTER) JOIN: Return all records from the right table, and the matched records from the left table\n", "- FULL (OUTER) JOIN: Return all records when there is a match in either left or right table" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT *\n", " FROM df1\n", " INNER JOIN df2\n", " ON df1.SK_ID_CURR = df2.SK_ID_CURR\n", "\n", "+----------+----------------+-----------+------------+\n", "|SK_ID_CURR|AMT_INCOME_TOTAL|CODE_GENDER|FLAG_OWN_CAR|\n", "+----------+----------------+-----------+------------+\n", "| 100002| 202500.0| M| N|\n", "| 100003| 270000.0| F| N|\n", "| 100004| 67500.0| M| Y|\n", "| 100006| 135000.0| F| N|\n", "| 100007| 121500.0| M| N|\n", "| 100008| 99000.0| M| N|\n", "| 100009| 171000.0| F| Y|\n", "| 100010| 360000.0| M| Y|\n", "| 100011| 112500.0| F| N|\n", "| 100012| 135000.0| M| N|\n", "| 100014| 112500.0| F| N|\n", "| 100015| 38419.155| F| N|\n", "| 100016| 67500.0| F| N|\n", "| 100017| 225000.0| M| Y|\n", "| 100018| 189000.0| F| N|\n", "| 100019| 157500.0| M| Y|\n", "| 100020| 108000.0| M| N|\n", "| 100021| 81000.0| F| N|\n", "| 100022| 112500.0| F| N|\n", "| 100023| 90000.0| F| N|\n", "+----------+----------------+-----------+------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT *\n", " FROM df1\n", " INNER JOIN df2\n", " ON df1.SK_ID_CURR = df2.SK_ID_CURR\n", "\"\"\")\n", "\n", "df1 = df_data.select(\"SK_ID_CURR\", \"AMT_INCOME_TOTAL\")\n", "df2 = df_data.select(\"SK_ID_CURR\", \"CODE_GENDER\", \"FLAG_OWN_CAR\")\n", "df1.join(df2, on=\"SK_ID_CURR\", how=\"inner\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# UNION ALL and UNION" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " SELECT * FROM df1\n", " UNION ALL\n", " SELECT * FROM df2\n", "\n", "Union all: 615022\n", "Union: 6\n" ] } ], "source": [ "print(\"\"\"\n", " SELECT * FROM df1\n", " UNION ALL\n", " SELECT * FROM df2\n", "\"\"\")\n", "\n", "df1 = df_data.select(\"CODE_GENDER\", \"FLAG_OWN_CAR\")\n", "df2 = df_data.select(\"CODE_GENDER\", \"FLAG_OWN_CAR\")\n", "\n", "print(\"Union all:\", df1.unionAll(df2).count())\n", "print(\"Union:\", df1.unionAll(df2).distinct().count())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Insert, delete, update\n", "\n", "Spark data frame is immutable" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }