{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "-ref http://stackoverflow.com/questions/30940631/how-do-i-setup-pyspark-in-python-3-with-spark-env-sh-template" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python 3.4.5 :: Continuum Analytics, Inc.\n", "Thu Nov 3 15:00:16 CST 2016\n" ] } ], "source": [ "!python --version\n", "! date" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sc" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'local[*]'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sc.master" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "//anaconda/envs/g_dash/lib/python3.4/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. 
`IPython.html.widgets` has moved to `ipywidgets`.\n", " \"`IPython.html.widgets` has moved to `ipywidgets`.\", ShimWarning)\n" ] } ], "source": [ "import datetime as dt\n", "import time\n", "import pandas as pd\n", "import numpy as np\n", "import pprint\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# NOTE(review): %pylab fills the interactive namespace from numpy and matplotlib; consider the narrower %matplotlib inline\n", "%pylab inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# * Run Spark in an IPython notebook\n", "\n", "# Index\n", "\n", "- RDD basics\n", "- Map\n", "- Reduce" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create RDD" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# build one RDD of integers and one RDD of strings from local Python lists\n", "\n", "intRDD = sc.parallelize([6, 7, 1, 2, 0])\n", "intRDD2 = sc.parallelize([\"apple\", \"car\", \"pan\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "pyspark.rdd.RDD" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(intRDD)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[6, 7, 1, 2, 0]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD.collect()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "['apple', 'car', 'pan']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD2.collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# * Transformation\n", "\n", "- Transformations are lazy: nothing is executed until an action is run" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Map " ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def addone(x):\n", "    \"\"\"Return x plus one.\"\"\"\n", "    return x + 1" ] }, { "cell_type": "code", "execution_count": 10, 
"metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[7, 8, 2, 3, 1]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set env variable via export PYSPARK_PYTHON=python3 for setting SPARK run in python 3 \n", "\n", "intRDD.map(addone).collect()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[7, 8, 2, 3, 1]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# or via lammbda \n", "\n", "intRDD.map(lambda x : x + 1 ).collect()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['object: apple', 'object: car', 'object: pan']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# work with words RDD \n", "\n", "intRDD2.map(lambda x : \"object: \"+ x ).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filter" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[6, 7]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD.filter(lambda x : x >5).collect()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[6, 7, 2]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD.filter(lambda x : x > 1 & x < 5).collect()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['car']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD2.filter(lambda x : \"ar\" in x ).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Distinct" ] }, { "cell_type": "code", 
"execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "intRDD3 = sc.parallelize([1,1,2,2,3])\n", "intRDD4 = sc.parallelize([\"apple\",\"apple\",\"car\", \"car\", \"pan\"])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD3.distinct().collect()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['car', 'pan', 'apple']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD4.distinct().collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Random split" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sRDD = intRDD.randomSplit([0.4,0.6])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[7, 1, 2]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sRDD[0].collect()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[6, 0]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sRDD[1].collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Group by " ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "gRDD = intRDD.groupBy(lambda x : \"even\" if (x%2 ==0 ) else \"odd\").collect()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "list" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(gRDD)" ] }, { 
"cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('even', ), ('odd', )]\n" ] } ], "source": [ "print (gRDD)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('even', [0, 2, 6])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gRDD[0][0] , sorted(gRDD[0][1])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('odd', [1, 7])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gRDD[1][0] , sorted(gRDD[1][1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Multiple RDDs" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "intRDD5 = sc.parallelize([3,1,2,2,5])\n", "intRDD6 = sc.parallelize([1,0])\n", "intRDD7 = sc.parallelize([4,5,6])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Union" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[3, 1, 2, 2, 5, 1, 0, 4, 5, 6]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD5.union(intRDD6).union(intRDD7).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Intersection" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD5.intersection(intRDD6).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Subtract" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 2, 
3]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD5.subtract(intRDD7).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# * Action\n", "\n", "- Spark only computes an RDD when an action is executed on it" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "intRDD = sc.parallelize([6,7,1,2,0])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD.first()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[6, 7, 1]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intRDD.take(3)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[0, 1, 2]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# take the 3 smallest elements, in ascending order\n", "\n", "intRDD.takeOrdered(3)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[7, 6, 2]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# order by the key -x (i.e. descending) and take the first 3 elements: the 3 largest\n", "\n", "intRDD.takeOrdered(3, key = lambda x: -x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Statistics " ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(count: 5, mean: 3.2, stdev: 2.78567765544, max: 7.0, min: 0.0)\n", "0\n", "7\n", "2.78567765544\n", "16\n", "3.2\n" ] } ], "source": [ "print (intRDD.stats())\n", "\n", "print (intRDD.min())\n", "\n", "print 
(intRDD.max())\n", "\n", "print (intRDD.stdev())\n", "\n", "print (intRDD.sum())\n", "\n", "print (intRDD.mean())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Key-Value Transformation" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2), (3, 6), (5, 6), (0, 9)]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# an RDD of (key, value) tuples\n", "kvRDD1 = sc.parallelize([(1, 2), (3, 6), (5, 6), (0, 9)])\n", "kvRDD1.collect()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 3, 5, 0]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# first element of every pair\n", "kvRDD1.keys().collect()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[2, 6, 6, 9]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# second element of every pair\n", "kvRDD1.values().collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filter with key" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2), (0, 9)]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD1.filter(lambda pair: pair[0] < 3).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Filter with Value" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2)]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD1.filter(lambda pair: pair[1] < 3).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sort by key" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { 
"collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(0, 9), (1, 2), (3, 6), (5, 6)]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD1.sortByKey(ascending=True).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reduce by key **" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2), (1, 3), (5, 6), (3, 9), (3, 1)]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD2 = sc.parallelize([(1,2),(1,3),(5,6),(3,9),(3,1)])\n", "kvRDD2.collect()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 5), (5, 6), (3, 10)]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# (1, 2+3) , (5,6), (3, 9+1)\n", "# sum up values of (key,value) with same key \n", "\n", "kvRDD2.reduceByKey(lambda x,y : x+y).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Multi - Key-Value Transformation" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": true }, "outputs": [], "source": [ "kvRDD3 = sc.parallelize([(1,2),(1,3),(5,6),(3,9),(3,1)])\n", "kvRDD4 = sc.parallelize([(3,8)])" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(3, (9, 8)), (3, (1, 8))]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# key - value RDD join\n", "\n", "kvRDD3.join(kvRDD4).collect()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, (2, None)), (1, (3, None)), (3, (9, 8)), (3, (1, 8)), (5, (6, None))]" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# key - value left OUTER 
RDD join\n", "\n", "kvRDD3.leftOuterJoin(kvRDD4).collect()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(3, (9, 8)), (3, (1, 8))]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# key - value right OUTER RDD join\n", "\n", "kvRDD3.rightOuterJoin(kvRDD4).collect()" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2), (1, 3), (5, 6)]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# key - value subtract by key \n", "\n", "kvRDD3.subtractByKey(kvRDD4).collect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Key-Value Action" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(1, 2)" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD3.first()" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD3.first()[0]" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD3.first()[1]" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2), (1, 3)]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD3.take(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Count by key" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { 
"text/plain": [ "defaultdict(int, {1: 2, 3: 2, 5: 1})" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kvRDD3.countByKey()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Collect as map" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{1: 3, 3: 1, 5: 6}" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "KV = kvRDD3.collectAsMap()\n", "KV" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(KV)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[6]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# key - value lookup\n", "\n", "kvRDD3.lookup(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Broadcast" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [], "source": [ "kvFruit = sc.parallelize([(1,\"apple\"),(2,\"banana\"),(3,\"peach\"),(4,\"grape\")])" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{1: 'apple', 2: 'banana', 3: 'peach', 4: 'grape'}" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fruitMap = kvFruit.collectAsMap()\n", "fruitMap" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Broadcast \"bcFruitMap\"\n", "\n", "bcFruitMap = sc.broadcast(fruitMap)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[2, 4, 1, 3]" ] }, "execution_count": 
76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fruitIds = sc.parallelize([2,4,1,3])\n", "fruitIds.collect()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['banana', 'grape', 'apple', 'peach']" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# translate every id to its name through the broadcast dict\n", "\n", "fruitNames = fruitIds.map(lambda x : bcFruitMap.value[x]).collect()\n", "\n", "fruitNames" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Accumulator" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# TODO: add an accumulator example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# RDD Persistence " ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# TODO: add an RDD persistence example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ** Case : WordCount" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# fruit.txt is a text file with content like this:\n", "\"\"\"\n", "apple apple orange \n", "banana grape grape \n", "\"\"\"\n", "\n", "textfile = sc.textFile(\"fruit.txt\")" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['apple apple orange', 'banana grape grape']" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "textfile.collect()" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['apple', 'apple', 'orange'], ['banana', 'grape', 'grape']]" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# map: one list of words per input line\n", "# split() without an argument splits on any run of whitespace, so repeated or trailing spaces yield no empty words\n", "\n", "stringRDD = textfile.map(lambda line : line.split())\n", "stringRDD.collect()" ] }, { "cell_type": "code", 
"execution_count": 113, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['apple', 'apple', 'orange', 'banana', 'grape', 'grape']" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# flatmap \n", "\n", "stringRDD = textfile.flatMap(lambda line : line.split(\" \"))\n", "stringRDD.collect()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('apple', 1),\n", " ('apple', 1),\n", " ('orange', 1),\n", " ('banana', 1),\n", " ('grape', 1),\n", " ('grape', 1)]" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# key-value pair by Map\n", "\n", "stringRDD.map(lambda word : (word,1)).collect()" ] }, { "cell_type": "code", "execution_count": 116, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('orange', 1), ('banana', 1), ('apple', 2), ('grape', 2)]" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# word count by reduceByKey\n", "\n", "stringRDD.map(lambda word : (word,1)).reduceByKey(lambda x,y : x+y).collect()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.5" } }, "nbformat": 4, "nbformat_minor": 1 }