{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Welcome to\n",
      " ____ __\n",
      " / __/__ ___ _____/ /__\n",
      " _\\ \\/ _ \\/ _ `/ __/ '_/\n",
      " /__ / .__/\\_,_/_/ /_/\\_\\ version 2.4.0\n",
      " /_/\n",
      "\n",
      "Using Python version 2.7.15 (default, Dec 14 2018 19:04:19)\n",
      "SparkSession available as 'spark'.\n",
      "local-1560753877291\n"
     ]
    }
   ],
   "source": [
    "import os, sys, glob, datetime\n",
    "\n",
    "# specify spark version, python version\n",
    "spark_home = \"/home/zero/spark-2.4.0-bin-hadoop2.7\" # MODIFY THIS\n",
    "python_path =\"/apps/anaconda2/bin/python\"\n",
    "# set environment variables\n",
    "os.environ['SPARK_HOME'] = spark_home\n",
    "os.environ['PYSPARK_PYTHON'] = python_path\n",
    "os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n",
    "\n",
    "def setup_spark_env(app_name):\n",
    "    \"\"\"Put the bundled PySpark on sys.path and set PYSPARK_SUBMIT_ARGS.\n",
    "\n",
    "    app_name: prefix of the Spark application name; the current\n",
    "        date and time are appended to it.\n",
    "\n",
    "    Reads the module-level spark_home. Raises IOError when no\n",
    "    py4j-*.zip exists under <spark_home>/python/lib.\n",
    "    \"\"\"\n",
    "    spark_python = os.path.join(spark_home, 'python')\n",
    "    py4j_lib = os.path.join(spark_python, 'lib')\n",
    "    py4j_zips = sorted(glob.glob(os.path.join(py4j_lib, 'py4j-*.zip')))\n",
    "    if not py4j_zips:\n",
    "        # fail with a clear message rather than an IndexError from [0]\n",
    "        raise IOError(\"no py4j-*.zip found in %s; check spark_home\" % py4j_lib)\n",
    "    # move both entries to the front; re-running the cell adds no duplicates\n",
    "    front = [spark_python, py4j_zips[0]]\n",
    "    sys.path[:] = front + [p for p in sys.path if p not in front]\n",
    "    # specify Spark application parameters\n",
    "    PYSPARK_SUBMIT_ARGS=\"--master local[2]\"\n",
    "\n",
    "    os.environ['PYSPARK_SUBMIT_ARGS'] = (PYSPARK_SUBMIT_ARGS\n",
    "        + \" --name '%s_%s'\"%(app_name, datetime.datetime.now().strftime(\"%Y%m%d %H:%M\"))\n",
    "        + \" pyspark-shell\")\n",
    "\n",
    "#\n",
    "setup_spark_env(\"your_name_test_spark\") # MODIFY THIS\n",
    "# launching PySpark application\n",
    "# exec(compile(...)) instead of execfile(): execfile exists only on Python 2,\n",
    "# and the with-block closes the script file once it has been read\n",
    "shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')\n",
    "with open(shell_script) as shell_file:\n",
    "    exec(compile(shell_file.read(), shell_script, 'exec'))\n",
    "sc.setLogLevel('ERROR')\n",
    "print(\"{}\".format(sc.applicationId))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import Row\n",
    "from pyspark.sql.types import *\n",
    "import pyspark.sql.functions as sf\n",
    "from pyspark.sql.window import Window\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create sample data"
   ]
}, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | product | \n", "category | \n", "revenue | \n", "
---|---|---|---|
0 | \n", "Thin | \n", "Cell phone | \n", "6000 | \n", "
1 | \n", "Normal | \n", "Tablet | \n", "1500 | \n", "
2 | \n", "Mini | \n", "Tablet | \n", "5500 | \n", "
3 | \n", "Ultra thin | \n", "Cell phone | \n", "5000 | \n", "
4 | \n", "Very thin | \n", "Cell phone | \n", "6000 | \n", "
5 | \n", "Big | \n", "Tablet | \n", "2500 | \n", "
6 | \n", "Bendable | \n", "Cell phone | \n", "3000 | \n", "
7 | \n", "Foldable | \n", "Cell phone | \n", "3000 | \n", "
8 | \n", "Pro | \n", "Tablet | \n", "4500 | \n", "
9 | \n", "Pro2 | \n", "Tablet | \n", "6500 | \n", "