{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## This notebook is part of Hadoop and Spark training delivered by CERN IT\n" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### SPARK DataFrame Hands-On Lab\n", "Contact: Luca.Canali@cern.ch\n", "\n", "### Objective: Perform Basic DataFrame Operations\n", "1. Creating DataFrames\n", "2. Select columns\n", "3. Add, rename and drop columns\n", "4. Filtering rows\n", "5. Aggregations" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Exercises and solutions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reminder: documentation at \n", "https://spark.apache.org/docs/latest/api/python/index.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Start a local SparkSession and load the\n", "# online-retail dataset with an explicit schema\n", "\n", "from pyspark.sql import SparkSession\n", "\n", "# Local-mode session; the console progress bar is disabled via config\n", "spark = (\n", "    SparkSession.builder\n", "    .master(\"local[*]\")\n", "    .appName(\"DataFrame HandsOn 1\")\n", "    .config(\"spark.ui.showConsoleProgress\", \"false\")\n", "    .getOrCreate()\n", ")\n", "\n", "# Column names and types as one schema string;\n", "# Python joins the adjacent literals into a single value\n", "online_retail_schema = (\n", "    \"InvoiceNo int, StockCode string, Description string, Quantity int,\"\n", "    \"InvoiceDate timestamp,UnitPrice float,CustomerId int, Country string\"\n", ")\n", "\n", "# First CSV line is treated as the header; timestampFormat gives\n", "# the pattern used to parse the timestamp column\n", "df = (\n", "    spark.read\n", "    .option(\"header\", \"true\")\n", "    .option(\"timestampFormat\", \"M/d/yyyy H:m\")\n", "    .csv(\"../data/online-retail-dataset.csv.gz\", schema=online_retail_schema)\n", ")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
SparkSession - in-memory
\n", " \n", "SparkContext
\n", "\n", " \n", "\n", "v3.3.1local[*]DataFrame HandsOn 1| \n", " | InvoiceNo | \n", "StockCode | \n", "Description | \n", "Quantity | \n", "InvoiceDate | \n", "UnitPrice | \n", "CustomerId | \n", "Country | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "536365 | \n", "85123A | \n", "WHITE HANGING HEART T-LIGHT HOLDER | \n", "6 | \n", "2010-12-01 08:26:00 | \n", "2.55 | \n", "17850 | \n", "United Kingdom | \n", "
| 1 | \n", "536365 | \n", "71053 | \n", "WHITE METAL LANTERN | \n", "6 | \n", "2010-12-01 08:26:00 | \n", "3.39 | \n", "17850 | \n", "United Kingdom | \n", "
| 2 | \n", "536365 | \n", "84406B | \n", "CREAM CUPID HEARTS COAT HANGER | \n", "8 | \n", "2010-12-01 08:26:00 | \n", "2.75 | \n", "17850 | \n", "United Kingdom | \n", "
| 3 | \n", "536365 | \n", "84029G | \n", "KNITTED UNION FLAG HOT WATER BOTTLE | \n", "6 | \n", "2010-12-01 08:26:00 | \n", "3.39 | \n", "17850 | \n", "United Kingdom | \n", "
| 4 | \n", "536365 | \n", "84029E | \n", "RED WOOLLY HOTTIE WHITE HEART. | \n", "6 | \n", "2010-12-01 08:26:00 | \n", "3.39 | \n", "17850 | \n", "United Kingdom | \n", "