{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# about\n", "- after much frustration with hours of trial and error, finally got pyspark to run on jupyter notebook\n", "- (most help I found on the web was based on the old IPython-notebook and Spark 1.6....I wanted to get things to work on Jupyter and Spark 2.0, which I couldn't find much resource online)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://spark.apache.org/docs/2.0.0/quick-start.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Adding below in my `.bashrc` file finally solved my issues\n", "\n", "```bash\n", "export SPARK_HOME=$HOME/mybin/spark-2.0.0-bin-hadoop2.7\n", "export PATH=\"$SPARK_HOME:$PATH\"\n", "export PATH=$PATH:$SPARK_HOME/bin\n", "\n", "export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH\n", "# export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.10.1-src.zip:$PYTHONPATH\n", "\n", "export ANACONDA_ROOT=~/anaconda2\n", "export PYSPARK_DRIVER_PYTHON=$ANACONDA_ROOT/bin/jupyter\n", "export PYSPARK_DRIVER_PYTHON_OPTS='notebook' pyspark\n", "export PYSPARK_PYTHON=$ANACONDA_ROOT/bin/python\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pyspark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'/home/takanori/mybin/spark-2.0.0-bin-hadoop2.7'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.environ['SPARK_HOME']" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "textFile = sc.textFile(os.path.join(os.environ['SPARK_HOME'],'README.md'))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "99" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "textFile.count() # Number of items in this RDD" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "u'# Apache Spark'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "textFile.first() # First item in this RDD" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "linesWithSpark = textFile.filter(lambda line: \"Spark\" in line)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "19" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "textFile.filter(lambda line: \"Spark\" in line).count() # How many lines contain \"Spark\"?" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "22" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "textFile.map(lambda line: len(line.split())).reduce(lambda a, b: a if (a > b) else b)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "22" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ ">>> def max(a, b):\n", "... if a > b:\n", "... return a\n", "... else:\n", "... 
return b\n", "...\n", "\n", ">>> textFile.map(lambda line: len(line.split())).reduce(max)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ ">>> wordCounts = textFile.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(u'when', 1),\n", " (u'R,', 1),\n", " (u'including', 3),\n", " (u'computation', 1),\n", " (u'using:', 1),\n", " (u'guidance', 2),\n", " (u'Scala,', 1),\n", " (u'environment', 1),\n", " (u'only', 1),\n", " (u'rich', 1),\n", " (u'Apache', 1),\n", " (u'sc.parallelize(range(1000)).count()', 1),\n", " (u'Building', 1),\n", " (u'And', 1),\n", " (u'guide,', 1),\n", " (u'return', 2),\n", " (u'Please', 3),\n", " (u'[Eclipse](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-Eclipse)',\n", " 1),\n", " (u'Try', 1),\n", " (u'not', 1),\n", " (u'Spark', 15),\n", " (u'scala>', 1),\n", " (u'Note', 1),\n", " (u'cluster.', 1),\n", " (u'./bin/pyspark', 1),\n", " (u'params', 1),\n", " (u'through', 1),\n", " (u'GraphX', 1),\n", " (u'[run', 1),\n", " (u'abbreviated', 1),\n", " (u'For', 3),\n", " (u'##', 8),\n", " (u'library', 1),\n", " (u'see', 3),\n", " (u'\"local\"', 1),\n", " (u'[Apache', 1),\n", " (u'will', 1),\n", " (u'#', 1),\n", " (u'processing,', 1),\n", " (u'for', 11),\n", " (u'[building', 1),\n", " (u'Maven', 1),\n", " (u'[\"Parallel', 1),\n", " (u'provides', 1),\n", " (u'print', 1),\n", " (u'supports', 2),\n", " (u'built,', 1),\n", " (u'[params]`.', 1),\n", " (u'available', 1),\n", " (u'run', 7),\n", " (u'tests](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools).',\n", " 1),\n", " (u'This', 2),\n", " (u'Hadoop,', 2),\n", " (u'Tests', 1),\n", " (u'example:', 1),\n", " (u'-DskipTests', 1),\n", " (u'Maven](http://maven.apache.org/).', 1),\n", " (u'thread', 1),\n", " (u'programming', 1),\n", " (u'running', 1),\n", " (u'against', 1),\n", " (u'site,', 1),\n", " (u'comes', 1),\n", " (u'package.', 1),\n", " (u'and', 11),\n", " (u'package.)', 1),\n", " (u'prefer', 1),\n", " (u'documentation,', 1),\n", " (u'submit', 1),\n", " (u'tools', 1),\n", " (u'use', 3),\n", " (u'from', 1),\n", " (u'[project', 2),\n", " (u'./bin/run-example', 2),\n", " (u'fast', 1),\n", " (u'systems.', 1),\n", " (u'', 1),\n", " (u'Hadoop-supported', 1),\n", " (u'way', 1),\n", " (u'README', 1),\n", " (u'MASTER', 1),\n", " (u'engine', 1),\n", " (u'building', 2),\n", " (u'usage', 1),\n", " (u'instance:', 1),\n", " (u'with', 4),\n", " (u'protocols', 1),\n", " (u'IDE,', 1),\n", " (u'this', 1),\n", " (u'setup', 1),\n", " (u'shell:', 2),\n", " (u'project', 1),\n", " (u'following', 2),\n", " (u'distribution', 1),\n", " (u'detailed', 2),\n", " (u'have', 1),\n", " (u'stream', 1),\n", " (u'is', 6),\n", " (u'higher-level', 1),\n", " (u'tests', 2),\n", " (u'1000:', 2),\n", " (u'sample', 1),\n", " (u'[\"Specifying', 1),\n", " (u'Alternatively,', 1),\n", " (u'file', 1),\n", " (u'need', 1),\n", " (u'You', 4),\n", " (u'instructions.', 1),\n", " (u'different', 1),\n", " (u'programs,', 1),\n", " (u'storage', 1),\n", " (u'same', 1),\n", " (u'machine', 1),\n", " (u'Running', 1),\n", " (u'which', 2),\n", " (u'you', 4),\n", " (u'A', 1),\n", " (u'About', 1),\n", " (u'sc.parallelize(1', 1),\n", " (u'locally.', 1),\n", " (u'Hive', 2),\n", " (u'optimized', 1),\n", " (u'uses', 1),\n", " 
(u'Version\"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version)',\n", " 1),\n", " (u'variable', 1),\n", " (u'The', 1),\n", " (u'data', 1),\n", " (u'a', 8),\n", " (u'\"yarn\"', 1),\n", " (u'Thriftserver', 1),\n", " (u'processing.', 1),\n", " (u'./bin/spark-shell', 1),\n", " (u'Python', 2),\n", " (u'Spark](#building-spark).', 1),\n", " (u'clean', 1),\n", " (u'the', 22),\n", " (u'requires', 1),\n", " (u'talk', 1),\n", " (u'help', 1),\n", " (u'Hadoop', 3),\n", " (u'-T', 1),\n", " (u'high-level', 1),\n", " (u'its', 1),\n", " (u'web', 1),\n", " (u'Shell', 2),\n", " (u'how', 2),\n", " (u'graph', 1),\n", " (u'run:', 1),\n", " (u'should', 2),\n", " (u'to', 14),\n", " (u'module,', 1),\n", " (u'given.', 1),\n", " (u'directory.', 1),\n", " (u'must', 1),\n", " (u'do', 2),\n", " (u'Programs', 1),\n", " (u'Many', 1),\n", " (u'YARN,', 1),\n", " (u'using', 5),\n", " (u'Example', 1),\n", " (u'Once', 1),\n", " (u'Spark\"](http://spark.apache.org/docs/latest/building-spark.html).', 1),\n", " (u'Because', 1),\n", " (u'name', 1),\n", " (u'Testing', 1),\n", " (u'refer', 2),\n", " (u'Streaming', 1),\n", " (u'[IntelliJ](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-IntelliJ).',\n", " 1),\n", " (u'SQL', 2),\n", " (u'them,', 1),\n", " (u'analysis.', 1),\n", " (u'set', 2),\n", " (u'Scala', 2),\n", " (u'thread,', 1),\n", " (u'individual', 1),\n", " (u'examples', 2),\n", " (u'runs.', 1),\n", " (u'Pi', 1),\n", " (u'More', 1),\n", " (u'Python,', 2),\n", " (u'Versions', 1),\n", " (u'find', 1),\n", " (u'version', 1),\n", " (u'wiki](https://cwiki.apache.org/confluence/display/SPARK).', 1),\n", " (u'`./bin/run-example', 1),\n", " (u'Configuration', 1),\n", " (u'command,', 2),\n", " (u'Maven,', 1),\n", " (u'core', 1),\n", " (u'Guide](http://spark.apache.org/docs/latest/configuration.html)', 1),\n", " (u'MASTER=spark://host:7077', 1),\n", " (u'Documentation', 1),\n", " (u'downloaded', 1),\n", " (u'distributions.', 1),\n", " (u'Spark.', 1),\n", " (u'[\"Building', 1),\n", " (u'by', 1),\n", " (u'on', 5),\n", " (u'package', 1),\n", " (u'of', 5),\n", " (u'changed', 1),\n", " (u'pre-built', 1),\n", " (u'Big', 1),\n", " (u'3\"](https://cwiki.apache.org/confluence/display/MAVEN/Parallel+builds+in+Maven+3).',\n", " 1),\n", " (u'or', 3),\n", " (u'learning,', 1),\n", " (u'locally', 2),\n", " (u'overview', 1),\n", " (u'one', 3),\n", " (u'(You', 1),\n", " (u'Online', 1),\n", " (u'versions', 1),\n", " (u'your', 1),\n", " (u'threads.', 1),\n", " (u'APIs', 1),\n", " (u'SparkPi', 2),\n", " (u'contains', 1),\n", " (u'system', 1),\n", " (u'`examples`', 2),\n", " (u'start', 1),\n", " (u'build/mvn', 1),\n", " (u'easiest', 1),\n", " (u'basic', 1),\n", " (u'more', 1),\n", " (u'option', 1),\n", " (u'that', 2),\n", " (u'N', 1),\n", " (u'\"local[N]\"', 1),\n", " (u'DataFrames,', 1),\n", " (u'particular', 2),\n", " (u'be', 2),\n", " (u'an', 4),\n", " (u'than', 1),\n", " (u'Interactive', 2),\n", " (u'builds', 1),\n", " (u'developing', 1),\n", " (u'programs', 2),\n", " (u'cluster', 2),\n", " (u'can', 7),\n", " (u'example', 3),\n", " (u'are', 1),\n", " (u'Data.', 1),\n", " (u'mesos://', 1),\n", " (u'computing', 1),\n", " (u'URL,', 1),\n", " (u'in', 6),\n", " (u'general', 2),\n", " (u'To', 2),\n", " (u'at', 2),\n", " (u'1000).count()', 1),\n", " (u'if', 4),\n", " (u'built', 1),\n", " (u'no', 1),\n", " (u'Java,', 1),\n", " (u'MLlib', 1),\n", " (u'also', 4),\n", " (u'other', 1),\n", " (u'build', 4),\n", " (u'online', 1),\n", " (u'several', 1),\n", " (u'HDFS', 1),\n", " 
(u'[Configuration', 1),\n", " (u'class', 2),\n", " (u'>>>', 1),\n", " (u'spark://', 1),\n", " (u'page](http://spark.apache.org/documentation.html)', 1),\n", " (u'documentation', 3),\n", " (u'It', 2),\n", " (u'graphs', 1),\n", " (u'./dev/run-tests', 1),\n", " (u'configure', 1),\n", " (u'', 1),\n", " (u'first', 1),\n", " (u'latest', 1)]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wordCounts.collect()" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "19\n", "19\n" ] } ], "source": [ "linesWithSpark.cache()  # persist in memory; the second count() reads from the cache\n", "\n", "print linesWithSpark.count()\n", "print linesWithSpark.count()\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# self-contained applications" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/takanori/mybin/spark-2.0.0-bin-hadoop2.7\n" ] } ], "source": [ "%%bash\n", "echo $SPARK_HOME" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "als.py\n", "avro_inputformat.py\n", "kmeans.py\n", "logistic_regression.py\n", "ml\n", "mllib\n", "pagerank.py\n", "parquet_inputformat.py\n", "pi.py\n", "sort.py\n", "sql\n", "sql.py\n", "status_api_demo.py\n", "streaming\n", "transitive_closure.py\n", "wordcount.py\n" ] } ], "source": [ "%%bash\n", "#ls ${SPARK_HOME}/python/pyspark\n", "ls ${SPARK_HOME}/examples/src/main/python" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "jupyter: '/home/takanori/mybin/spark-2.0.0-bin-hadoop2.7/examples/src/main/python/pi.py' is not a Jupyter command\n" ] } ], "source": [ "%%bash\n", "spark-submit ${SPARK_HOME}/examples/src/main/python/pi.py" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Note\n", "The cell above fails because `PYSPARK_DRIVER_PYTHON` points at Jupyter, so `spark-submit` hands the script to Jupyter instead of Python. To run it, unset those variables in the command-line shell first:\n", "\n", "```bash\n", "unset PYSPARK_DRIVER_PYTHON\n", "unset PYSPARK_DRIVER_PYTHON_OPTS\n", "spark-submit ${SPARK_HOME}/examples/src/main/python/pi.py\n", "```" ] },
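{ "cell_type": "markdown", "metadata": {}, "source": [ "For completeness, here is a minimal self-contained application along the lines of the quick-start guide; save it as e.g. `SimpleApp.py` (the file name is arbitrary) and launch it with `spark-submit SimpleApp.py` after the unsets above:\n", "\n", "```python\n", "\"\"\"SimpleApp.py: count lines containing 'a' and 'b' in the Spark README.\"\"\"\n", "import os\n", "from pyspark import SparkContext\n", "\n", "logFile = os.path.join(os.environ['SPARK_HOME'], 'README.md')\n", "sc = SparkContext('local', 'Simple App')  # standalone scripts create their own context\n", "logData = sc.textFile(logFile).cache()\n", "\n", "numAs = logData.filter(lambda s: 'a' in s).count()\n", "numBs = logData.filter(lambda s: 'b' in s).count()\n", "print('Lines with a: %i, lines with b: %i' % (numAs, numBs))\n", "sc.stop()\n", "```" ] } ], "metadata": { "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 0 }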