{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PySpark basics: RDD operations and Spark SQL\n",
    "\n",
    "Small RDD examples (`mean`, `reduce`, `reduceByKey`) followed by a Spark SQL aggregation over a pets CSV file.\n",
    "\n",
    "Before running the pets examples, replace `BUCKET_NAME` in the setup cell with the name of your bucket."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.types import StringType, StructField, StructType\n",
    "\n",
    "# Single place to edit the input location; both pets examples read this file.\n",
    "PETS_PATH = 'gs://BUCKET_NAME/unstructured/pets.txt'\n",
    "\n",
    "# Keep `sc` / `spark` if they are already defined in this kernel session;\n",
    "# otherwise create them so the notebook also runs on a fresh plain Python kernel.\n",
    "if 'sc' not in locals():\n",
    "    sc = SparkContext(\"local\")\n",
    "if 'spark' not in locals():\n",
    "    spark = SparkSession.builder.getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RDD basics\n",
    "\n",
    "Mean of the integers 0..999 spread over 10 partitions, then a sum computed with `reduce`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "499.5\n"
     ]
    }
   ],
   "source": [
    "rdd = sc.parallelize(range(1000), 10)\n",
    "print(rdd.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "231\n"
     ]
    }
   ],
   "source": [
    "data = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89]\n",
    "distData = sc.parallelize(data)\n",
    "res = distData.reduce(lambda a, b: a + b)\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Count pets per animal with Spark SQL\n",
    "\n",
    "The file is read as CSV with two string columns, `animal` and `name`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Row(animal=u'Frog', count(1)=2), Row(animal=u'Pig', count(1)=3), Row(animal=u'Cat', count(1)=5), Row(animal=u'Dog', count(1)=6)]\n"
     ]
    }
   ],
   "source": [
    "header = 'animal,name'\n",
    "schema = StructType([StructField(colname, StringType(), True) for colname in header.split(',')])\n",
    "pets = spark.read.schema(schema).csv(PETS_PATH)\n",
    "\n",
    "# Register the DataFrame as a temp view so the SQL below can refer to it by name.\n",
    "pets.createOrReplaceTempView('pets')\n",
    "countsByPet = spark.sql('SELECT animal, COUNT(*) from pets GROUP BY animal')\n",
    "\n",
    "print(countsByPet.take(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Group pet names by animal with an RDD\n",
    "\n",
    "The same file is read as plain text lines and the names are collected per animal with `reduceByKey`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(u'Frog', [u'Kermit', u'Hoppy']), (u'Pig', [u'Bacon', u'Babe', u'Tasty']), (u'Dog', [u'Noir', u'Bree', u'Pickles', u'Sparky', u'Gigi', u'Fred']), (u'Cat', [u'Tom', u'Alley', u'Cleo', u'George', u'Suzy'])]\n"
     ]
    }
   ],
   "source": [
    "petLines = sc.textFile(PETS_PATH)\n",
    "\n",
    "# Wrap each name in a one-element list so that `a + b` concatenates names per animal.\n",
    "petPairs = petLines.map(lambda s: s.split(\",\")).map(lambda x: (x[0], [x[1]]))\n",
    "petsByType = petPairs.reduceByKey(lambda a, b: a + b)\n",
    "print(petsByType.take(10))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}